diff --git a/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp b/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp
--- a/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp
@@ -56,6 +56,29 @@ class AMDGPULowerModuleLDS : public ModulePass {
+  GlobalVariable *createPaddingObject(Module &M, uint64_t &CurrentOffset,
+                                      Align Alignment) {
+    uint64_t AlignValue = Alignment.value();
+    if (uint64_t Rem = CurrentOffset % AlignValue) {
+      uint64_t Padding = AlignValue - Rem;
+
+      // Append an array of padding bytes to meet alignment requested
+      // Note (o + (a - (o % a)) ) % a == 0
+      //      (offset + Padding ) % align == 0
+
+      Type *ATy = ArrayType::get(Type::getInt8Ty(M.getContext()), Padding);
+      GlobalVariable *PaddingObject = new GlobalVariable(
+          M, ATy, false, GlobalValue::InternalLinkage, UndefValue::get(ATy), "",
+          nullptr, GlobalValue::NotThreadLocal, AMDGPUAS::LOCAL_ADDRESS, false);
+
+      CurrentOffset += Padding;
+
+      return PaddingObject;
+    }
+
+    return nullptr;
+  }
+
   static void removeFromUsedList(Module &M, StringRef Name,
                                  SmallPtrSetImpl<Constant *> &ToRemove) {
     GlobalVariable *GV = M.getNamedGlobal(Name);
@@ -229,29 +252,19 @@
   {
     // This usually won't need to insert any padding, perhaps avoid the alloc
     uint64_t CurrentOffset = 0;
-    for (size_t I = 0; I < FoundLocalVars.size(); I++) {
-      GlobalVariable *FGV = FoundLocalVars[I];
-      Align DataAlign = AMDGPU::getAlign(DL, FGV);
-
-      uint64_t DataAlignV = DataAlign.value();
-      if (uint64_t Rem = CurrentOffset % DataAlignV) {
-        uint64_t Padding = DataAlignV - Rem;
-
-        // Append an array of padding bytes to meet alignment requested
-        // Note (o + (a - (o % a)) ) % a == 0
-        //      (offset + Padding ) % align == 0
-
-        Type *ATy = ArrayType::get(Type::getInt8Ty(Ctx), Padding);
-        LocalVars.push_back(new GlobalVariable(
-            M, ATy, false, GlobalValue::InternalLinkage, UndefValue::get(ATy),
-            "", nullptr, GlobalValue::NotThreadLocal, AMDGPUAS::LOCAL_ADDRESS,
-            false));
-        CurrentOffset += Padding;
+    for (auto *FGV : FoundLocalVars) {
+      Align Alignment = AMDGPU::getAlign(DL, FGV);
+      if (auto *PO = createPaddingObject(M, CurrentOffset, Alignment)) {
+        LocalVars.push_back(PO);
       }
-
       LocalVars.push_back(FGV);
       CurrentOffset += DL.getTypeAllocSize(FGV->getValueType());
     }
+
+    Align Alignment = AMDGPU::getAlign(DL, LocalVars[0]);
+    if (auto *PO = createPaddingObject(M, CurrentOffset, Alignment)) {
+      LocalVars.push_back(PO);
+    }
   }
 
   std::vector<Type *> LocalVarTypes;
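
The helper factors the padding arithmetic out of the layout loop so the same rule can also run once after the loop, padding the struct's tail to the alignment of its first (most aligned) member. A minimal standalone sketch of that rule, with plain integers standing in for `Align` and the padding `GlobalVariable` (`padToAlign` is an illustrative name, not part of the pass):

```cpp
#include <cassert>
#include <cstdint>

// Illustrative helper mirroring createPaddingObject's arithmetic: returns the
// padding needed to raise Offset to the next multiple of Alignment (0 if it
// is already aligned) and advances Offset past that padding.
static uint64_t padToAlign(uint64_t &Offset, uint64_t Alignment) {
  uint64_t Padding = 0;
  if (uint64_t Rem = Offset % Alignment)
    Padding = Alignment - Rem; // (Offset + Padding) % Alignment == 0
  Offset += Padding;
  return Padding;
}

int main() {
  // A [38 x i8] variable followed by a 16-byte-aligned neighbour: 38 -> 48,
  // i.e. the [10 x i8] padding arrays seen throughout the test updates.
  uint64_t Offset = 38;
  assert(padToAlign(Offset, 16) == 10 && Offset == 48);
  // An already-aligned offset gets no padding object (createPaddingObject
  // returns nullptr in that case).
  assert(padToAlign(Offset, 16) == 0 && Offset == 48);
  return 0;
}
```
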
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/hip.extern.shared.array.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/hip.extern.shared.array.ll
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/hip.extern.shared.array.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/hip.extern.shared.array.ll
@@ -67,7 +67,7 @@
 ; specified.
 ; CHECK-LABEL: {{^}}dynamic_shared_array_3:
 ; CHECK: v_lshlrev_b32_e32 [[IDX:v[0-9]+]], 2, {{v[0-9]+}}
-; CHECK: v_add_u32_e32 {{v[0-9]+}}, 0x44, [[IDX]]
+; CHECK: v_add_u32_e32 {{v[0-9]+}}, 0x50, [[IDX]]
 define amdgpu_kernel void @dynamic_shared_array_3(i32 %idx) {
   %tid.x = tail call i32 @llvm.amdgcn.workitem.id.x()
   %vidx = add i32 %tid.x, %idx
@@ -82,7 +82,7 @@
 ; The offset to the dynamic shared memory array should be aligned on the
 ; maximal one.
 ; CHECK-LABEL: {{^}}dynamic_shared_array_4:
-; CHECK: v_mov_b32_e32 [[DYNLDS:v[0-9]+]], 0x48
+; CHECK: v_mov_b32_e32 [[DYNLDS:v[0-9]+]], 0x50
 ; CHECK: v_lshlrev_b32_e32 [[IDX:v[0-9]+]], 2, {{v[0-9]+}}
 ; CHECK: v_add_u32_e32 {{v[0-9]+}}, [[DYNLDS]], [[IDX]]
 define amdgpu_kernel void @dynamic_shared_array_4(i32 %idx) {
@@ -101,7 +101,7 @@
 
 ; Honor the explicit alignment from the specified variable.
 ; CHECK-LABEL: {{^}}dynamic_shared_array_5:
-; CHECK: v_mov_b32_e32 [[DYNLDS:v[0-9]+]], 0x44
+; CHECK: v_mov_b32_e32 [[DYNLDS:v[0-9]+]], 0x50
 ; CHECK: v_lshlrev_b32_e32 [[IDX:v[0-9]+]], 2, {{v[0-9]+}}
 ; CHECK: v_add_u32_e32 {{v[0-9]+}}, [[DYNLDS]], [[IDX]]
 define amdgpu_kernel void @dynamic_shared_array_5(i32 %idx) {
diff --git a/llvm/test/CodeGen/AMDGPU/hip.extern.shared.array.ll b/llvm/test/CodeGen/AMDGPU/hip.extern.shared.array.ll
--- a/llvm/test/CodeGen/AMDGPU/hip.extern.shared.array.ll
+++ b/llvm/test/CodeGen/AMDGPU/hip.extern.shared.array.ll
@@ -64,7 +64,7 @@
 ; The offset to the dynamic shared memory array should be aligned on the type
 ; specified.
 ; CHECK-LABEL: {{^}}dynamic_shared_array_3:
-; CHECK: v_mov_b32_e32 [[DYNLDS:v[0-9]+]], 0x44
+; CHECK: v_mov_b32_e32 [[DYNLDS:v[0-9]+]], 0x50
 ; CHECK: v_lshl_add_u32 {{v[0-9]+}}, {{v[0-9]+}}, 2, [[DYNLDS]]
 define amdgpu_kernel void @dynamic_shared_array_3(i32 %idx) {
   %tid.x = tail call i32 @llvm.amdgcn.workitem.id.x()
@@ -80,7 +80,7 @@
 ; The offset to the dynamic shared memory array should be aligned on the
 ; maximal one.
 ; CHECK-LABEL: {{^}}dynamic_shared_array_4:
-; CHECK: v_mov_b32_e32 [[DYNLDS:v[0-9]+]], 0x48
+; CHECK: v_mov_b32_e32 [[DYNLDS:v[0-9]+]], 0x50
 ; CHECK-DAG: v_lshl_add_u32 {{v[0-9]+}}, {{v[0-9]+}}, 2, [[DYNLDS]]
 ; CHECK-DAG: v_lshl_add_u32 {{v[0-9]+}}, {{v[0-9]+}}, 3, [[DYNLDS]]
 define amdgpu_kernel void @dynamic_shared_array_4(i32 %idx) {
@@ -99,7 +99,7 @@
 
 ; Honor the explicit alignment from the specified variable.
 ; CHECK-LABEL: {{^}}dynamic_shared_array_5:
-; CHECK: v_mov_b32_e32 [[DYNLDS:v[0-9]+]], 0x44
+; CHECK: v_mov_b32_e32 [[DYNLDS:v[0-9]+]], 0x50
 ; CHECK-DAG: v_lshl_add_u32 {{v[0-9]+}}, {{v[0-9]+}}, 2, [[DYNLDS]]
 ; CHECK-DAG: v_lshl_add_u32 {{v[0-9]+}}, {{v[0-9]+}}, 3, [[DYNLDS]]
 define amdgpu_kernel void @dynamic_shared_array_5(i32 %idx) {
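
All of these kernels now place the dynamic array at the same offset because the static LDS block, previously 0x44 (68) or 0x48 (72) bytes, is padded up to the struct's alignment before the dynamic portion begins. A sketch of that rounding, assuming the 16-byte alignment implied by the new 0x50 checks (`alignToPow2` is an illustrative stand-in for `llvm::alignTo`, valid for the power-of-two alignments used here):

```cpp
#include <cassert>
#include <cstdint>

// Round Value up to the next multiple of Align; correct for power-of-two
// alignments, matching llvm::alignTo for these inputs.
static uint64_t alignToPow2(uint64_t Value, uint64_t Align) {
  return (Value + Align - 1) & ~(Align - 1);
}

int main() {
  assert(alignToPow2(0x44, 16) == 0x50); // dynamic_shared_array_3/_5: 68 -> 80
  assert(alignToPow2(0x48, 16) == 0x50); // dynamic_shared_array_4:    72 -> 80
  return 0;
}
```
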
diff --git a/llvm/test/CodeGen/AMDGPU/lds-alignment.ll b/llvm/test/CodeGen/AMDGPU/lds-alignment.ll
--- a/llvm/test/CodeGen/AMDGPU/lds-alignment.ll
+++ b/llvm/test/CodeGen/AMDGPU/lds-alignment.ll
@@ -12,9 +12,10 @@
 declare void @llvm.memcpy.p3i8.p1i8.i32(i8 addrspace(3)* nocapture, i8 addrspace(1)* nocapture readonly, i32, i1) #0
 declare void @llvm.memcpy.p1i8.p3i8.i32(i8 addrspace(1)* nocapture, i8 addrspace(3)* nocapture readonly, i32, i1) #0
 
-
+; 38 + (10 pad) = 48
+;
 ; HSA-LABEL: {{^}}test_no_round_size_1:
-; HSA: workgroup_group_segment_byte_size = 38
+; HSA: workgroup_group_segment_byte_size = 48
 define amdgpu_kernel void @test_no_round_size_1(i8 addrspace(1)* %out, i8 addrspace(1)* %in) #1 {
   %lds.align16.0.bc = bitcast [38 x i8] addrspace(3)* @lds.align16.0 to i8 addrspace(3)*
   call void @llvm.memcpy.p3i8.p1i8.i32(i8 addrspace(3)* align 4 %lds.align16.0.bc, i8 addrspace(1)* align 4 %in, i32 38, i1 false)
@@ -22,17 +23,10 @@
   ret void
 }
 
-; There are two objects, so one requires padding to be correctly
-; aligned after the other.
-
-; (38 -> 48) + 38 = 86
-
-; I don't think it is necessary to add padding after since if there
-; were to be a dynamically sized LDS kernel arg, the runtime should
-; add the alignment padding if necessary alignment padding if needed.
-
+; 38 + (10 pad) + 38 + (10 pad) = 96
+;
 ; HSA-LABEL: {{^}}test_round_size_2:
-; HSA: workgroup_group_segment_byte_size = 86
+; HSA: workgroup_group_segment_byte_size = 96
 ; HSA: group_segment_alignment = 4
 define amdgpu_kernel void @test_round_size_2(i8 addrspace(1)* %out, i8 addrspace(1)* %in) #1 {
   %lds.align16.0.bc = bitcast [38 x i8] addrspace(3)* @lds.align16.0 to i8 addrspace(3)*
@@ -46,9 +40,10 @@
   ret void
 }
 
-; 38 + (10 pad) + 38 (= 86)
+; 38 + (10 pad) + 38 + (10 pad) = 96
+;
 ; HSA-LABEL: {{^}}test_round_size_2_align_8:
-; HSA: workgroup_group_segment_byte_size = 86
+; HSA: workgroup_group_segment_byte_size = 96
 ; HSA: group_segment_alignment = 4
 define amdgpu_kernel void @test_round_size_2_align_8(i8 addrspace(1)* %out, i8 addrspace(1)* %in) #1 {
   %lds.align16.0.bc = bitcast [38 x i8] addrspace(3)* @lds.align16.0 to i8 addrspace(3)*
@@ -62,8 +57,10 @@
   ret void
 }
 
+; 38 + (10 pad) = 48
+;
 ; HSA-LABEL: {{^}}test_round_local_lds_and_arg:
-; HSA: workgroup_group_segment_byte_size = 38
+; HSA: workgroup_group_segment_byte_size = 48
 ; HSA: group_segment_alignment = 4
 define amdgpu_kernel void @test_round_local_lds_and_arg(i8 addrspace(1)* %out, i8 addrspace(1)* %in, i8 addrspace(3)* %lds.arg) #1 {
   %lds.align16.0.bc = bitcast [38 x i8] addrspace(3)* @lds.align16.0 to i8 addrspace(3)*
@@ -75,6 +72,8 @@
   ret void
 }
 
+; FIXME: Parameter alignment not considered
+;
 ; HSA-LABEL: {{^}}test_round_lds_arg:
 ; HSA: workgroup_group_segment_byte_size = 0
 ; HSA: group_segment_alignment = 4
@@ -85,6 +84,7 @@
 }
 
 ; FIXME: Parameter alignment not considered
+;
 ; HSA-LABEL: {{^}}test_high_align_lds_arg:
 ; HSA: workgroup_group_segment_byte_size = 0
 ; HSA: group_segment_alignment = 4
@@ -94,9 +94,10 @@
   ret void
 }
 
-; (39 * 4) + (4 pad) + (7 * 8) = 216
+; (39 * 4) + (4 pad) + (7 * 8) + (8 pad) = 224
+;
 ; HSA-LABEL: {{^}}test_missing_alignment_size_2_order0:
-; HSA: workgroup_group_segment_byte_size = 216
+; HSA: workgroup_group_segment_byte_size = 224
 ; HSA: group_segment_alignment = 4
 define amdgpu_kernel void @test_missing_alignment_size_2_order0(i8 addrspace(1)* %out, i8 addrspace(1)* %in) #1 {
   %lds.missing.align.0.bc = bitcast [39 x i32] addrspace(3)* @lds.missing.align.0 to i8 addrspace(3)*
@@ -110,9 +111,10 @@
   ret void
 }
 
-; (39 * 4) + (4 pad) + (7 * 8) = 216
+; (39 * 4) + (4 pad) + (7 * 8) + (8 pad) = 224
+;
 ; HSA-LABEL: {{^}}test_missing_alignment_size_2_order1:
-; HSA: workgroup_group_segment_byte_size = 216
+; HSA: workgroup_group_segment_byte_size = 224
 ; HSA: group_segment_alignment = 4
 define amdgpu_kernel void @test_missing_alignment_size_2_order1(i8 addrspace(1)* %out, i8 addrspace(1)* %in) #1 {
   %lds.missing.align.1.bc = bitcast [7 x i64] addrspace(3)* @lds.missing.align.1 to i8 addrspace(3)*
@@ -126,10 +128,10 @@
   ret void
 }
 
-; align 32, 16, 16
-; 38 + (10 pad) + 38 + (10 pad) + 38 ( = 134)
+; 38 + (10 pad) + 38 + (10 pad) + 38 + (26 pad) = 160
+;
 ; HSA-LABEL: {{^}}test_round_size_3_order0:
-; HSA: workgroup_group_segment_byte_size = 134
+; HSA: workgroup_group_segment_byte_size = 160
 ; HSA: group_segment_alignment = 4
 define amdgpu_kernel void @test_round_size_3_order0(i8 addrspace(1)* %out, i8 addrspace(1)* %in) #1 {
   %lds.align32.0.bc = bitcast [38 x i8] addrspace(3)* @lds.align32.0 to i8 addrspace(3)*
@@ -147,10 +149,10 @@
   ret void
 }
 
-; align 32, 16, 16
-; 38 (+ 10 pad) + 38 + (10 pad) + 38 ( = 134)
+; 38 + (10 pad) + 38 + (10 pad) + 38 + (26 pad) = 160
+;
 ; HSA-LABEL: {{^}}test_round_size_3_order1:
-; HSA: workgroup_group_segment_byte_size = 134
+; HSA: workgroup_group_segment_byte_size = 160
 ; HSA: group_segment_alignment = 4
 define amdgpu_kernel void @test_round_size_3_order1(i8 addrspace(1)* %out, i8 addrspace(1)* %in) #1 {
   %lds.align32.0.bc = bitcast [38 x i8] addrspace(3)* @lds.align32.0 to i8 addrspace(3)*
@@ -168,10 +170,10 @@
   ret void
 }
 
-; align 32, 16, 16
-; 38 + (10 pad) + 38 + (10 pad) + 38 ( = 126)
+; 38 + (10 pad) + 38 + (10 pad) + 38 + (26 pad) = 160
+;
 ; HSA-LABEL: {{^}}test_round_size_3_order2:
-; HSA: workgroup_group_segment_byte_size = 134
+; HSA: workgroup_group_segment_byte_size = 160
 ; HSA: group_segment_alignment = 4
 define amdgpu_kernel void @test_round_size_3_order2(i8 addrspace(1)* %out, i8 addrspace(1)* %in) #1 {
   %lds.align16.0.bc = bitcast [38 x i8] addrspace(3)* @lds.align16.0 to i8 addrspace(3)*
@@ -189,10 +191,10 @@
   ret void
 }
 
-; align 32, 16, 16
-; 38 + (10 pad) + 38 + (10 pad) + 38 ( = 134)
+; 38 + (10 pad) + 38 + (10 pad) + 38 + (26 pad) = 160
+;
 ; HSA-LABEL: {{^}}test_round_size_3_order3:
-; HSA: workgroup_group_segment_byte_size = 134
+; HSA: workgroup_group_segment_byte_size = 160
 ; HSA: group_segment_alignment = 4
 define amdgpu_kernel void @test_round_size_3_order3(i8 addrspace(1)* %out, i8 addrspace(1)* %in) #1 {
   %lds.align16.0.bc = bitcast [38 x i8] addrspace(3)* @lds.align16.0 to i8 addrspace(3)*
@@ -210,10 +212,10 @@
   ret void
 }
 
-; align 32, 16, 16
-; 38 + (10 pad) + 38 + (10 pad) + 38 (= 134)
+; 38 + (10 pad) + 38 + (10 pad) + 38 + (26 pad) = 160
+;
 ; HSA-LABEL: {{^}}test_round_size_3_order4:
-; HSA: workgroup_group_segment_byte_size = 134
+; HSA: workgroup_group_segment_byte_size = 160
 ; HSA: group_segment_alignment = 4
 define amdgpu_kernel void @test_round_size_3_order4(i8 addrspace(1)* %out, i8 addrspace(1)* %in) #1 {
   %lds.align8.0.bc = bitcast [38 x i8] addrspace(3)* @lds.align8.0 to i8 addrspace(3)*
@@ -231,10 +233,10 @@
   ret void
 }
 
-; align 32, 16, 16
-; 38 + (10 pad) + 38 + (10 pad) + 38 (= 134)
+; 38 + (10 pad) + 38 + (10 pad) + 38 + (26 pad) = 160
+;
 ; HSA-LABEL: {{^}}test_round_size_3_order5:
-; HSA: workgroup_group_segment_byte_size = 134
+; HSA: workgroup_group_segment_byte_size = 160
 ; HSA: group_segment_alignment = 4
 define amdgpu_kernel void @test_round_size_3_order5(i8 addrspace(1)* %out, i8 addrspace(1)* %in) #1 {
   %lds.align8.0.bc = bitcast [38 x i8] addrspace(3)* @lds.align8.0 to i8 addrspace(3)*
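
The updated byte sizes above all follow from one rule: lay the variables out with padding before each misaligned member, then pad the total to the alignment of the first (most aligned) member, which is exactly what the new trailing `createPaddingObject` call does. A sketch reproducing the three totals, with `{size, alignment}` pairs read off this test's globals:

```cpp
#include <cassert>
#include <cstdint>
#include <utility>
#include <vector>

// Lay out {size, alignment} pairs (already sorted most-aligned first) with
// padding before each member, then pad the total to the first member's
// alignment, mirroring the new end-of-loop padding object.
static uint64_t
segmentSize(const std::vector<std::pair<uint64_t, uint64_t>> &Vars) {
  uint64_t Offset = 0;
  for (const auto &[Size, Alignment] : Vars) {
    if (uint64_t Rem = Offset % Alignment)
      Offset += Alignment - Rem; // padding object before the variable
    Offset += Size;
  }
  uint64_t MaxAlign = Vars.front().second;
  if (uint64_t Rem = Offset % MaxAlign)
    Offset += MaxAlign - Rem; // trailing padding object
  return Offset;
}

int main() {
  assert(segmentSize({{38, 16}}) == 48);                      // test_no_round_size_1
  assert(segmentSize({{38, 16}, {38, 16}}) == 96);            // test_round_size_2*
  assert(segmentSize({{38, 32}, {38, 16}, {38, 16}}) == 160); // test_round_size_3_order*
  return 0;
}
```
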
diff --git a/llvm/test/CodeGen/AMDGPU/lower-kernel-and-module-lds.ll b/llvm/test/CodeGen/AMDGPU/lower-kernel-and-module-lds.ll
--- a/llvm/test/CodeGen/AMDGPU/lower-kernel-and-module-lds.ll
+++ b/llvm/test/CodeGen/AMDGPU/lower-kernel-and-module-lds.ll
@@ -7,9 +7,9 @@
 @lds.size.8.align.8 = internal unnamed_addr addrspace(3) global [8 x i8] undef, align 8
 @lds.size.16.align.16 = internal unnamed_addr addrspace(3) global [16 x i8] undef, align 16
 
-; CHECK: %llvm.amdgcn.module.lds.t = type { [8 x i8], [1 x i8] }
-; CHECK: %llvm.amdgcn.kernel.k0.lds.t = type { [16 x i8], [4 x i8], [2 x i8] }
-; CHECK: %llvm.amdgcn.kernel.k1.lds.t = type { [16 x i8], [4 x i8], [2 x i8] }
+; CHECK: %llvm.amdgcn.module.lds.t = type { [8 x i8], [1 x i8], [7 x i8] }
+; CHECK: %llvm.amdgcn.kernel.k0.lds.t = type { [16 x i8], [4 x i8], [2 x i8], [10 x i8] }
+; CHECK: %llvm.amdgcn.kernel.k1.lds.t = type { [16 x i8], [4 x i8], [2 x i8], [10 x i8] }
 ; CHECK: %llvm.amdgcn.kernel..lds.t = type { [2 x i8] }
 ; CHECK: %llvm.amdgcn.kernel..lds.t.0 = type { [4 x i8] }
diff --git a/llvm/test/CodeGen/AMDGPU/lower-kernel-lds-global-uses.ll b/llvm/test/CodeGen/AMDGPU/lower-kernel-lds-global-uses.ll
--- a/llvm/test/CodeGen/AMDGPU/lower-kernel-lds-global-uses.ll
+++ b/llvm/test/CodeGen/AMDGPU/lower-kernel-lds-global-uses.ll
@@ -13,7 +13,7 @@
 ; @lds.6: is part of @llvm.compiler.used list, but is not used within kernel, hence it is not lowered.
 ;.
-; CHECK: %llvm.amdgcn.kernel.k0.lds.t = type { i32, i16 }
+; CHECK: %llvm.amdgcn.kernel.k0.lds.t = type { i32, i16, [2 x i8] }
 
 ; CHECK-NOT: @lds.1
 ; CHECK-NOT: @lds.2
diff --git a/llvm/test/CodeGen/AMDGPU/lower-kernel-lds.ll b/llvm/test/CodeGen/AMDGPU/lower-kernel-lds.ll
--- a/llvm/test/CodeGen/AMDGPU/lower-kernel-lds.ll
+++ b/llvm/test/CodeGen/AMDGPU/lower-kernel-lds.ll
@@ -7,8 +7,8 @@
 @lds.size.8.align.8 = internal unnamed_addr addrspace(3) global [8 x i8] undef, align 8
 @lds.size.16.align.16 = internal unnamed_addr addrspace(3) global [16 x i8] undef, align 16
 
-; CHECK: %llvm.amdgcn.kernel.k0.lds.t = type { [16 x i8], [4 x i8], [2 x i8], [1 x i8] }
-; CHECK: %llvm.amdgcn.kernel.k1.lds.t = type { [16 x i8], [4 x i8], [2 x i8] }
+; CHECK: %llvm.amdgcn.kernel.k0.lds.t = type { [16 x i8], [4 x i8], [2 x i8], [1 x i8], [9 x i8] }
+; CHECK: %llvm.amdgcn.kernel.k1.lds.t = type { [16 x i8], [4 x i8], [2 x i8], [10 x i8] }
 ;.
 ; CHECK: @lds.size.8.align.8 = internal unnamed_addr addrspace(3) global [8 x i8] undef, align 8
diff --git a/llvm/test/CodeGen/AMDGPU/lower-module-lds-global-alias.ll b/llvm/test/CodeGen/AMDGPU/lower-module-lds-global-alias.ll
--- a/llvm/test/CodeGen/AMDGPU/lower-module-lds-global-alias.ll
+++ b/llvm/test/CodeGen/AMDGPU/lower-module-lds-global-alias.ll
@@ -16,7 +16,7 @@
 ; But none of them are used anywhere. Hence, @lds.6 is not lowered.
 ;.
-; CHECK: %llvm.amdgcn.module.lds.t = type { [4 x i8], [3 x i8], [1 x i8], [2 x i8], [1 x i8] }
+; CHECK: %llvm.amdgcn.module.lds.t = type { [4 x i8], [3 x i8], [1 x i8], [2 x i8], [1 x i8], [1 x i8] }
 
 ; CHECK-NOT: @lds.1
 ; CHECK-NOT: @lds.2
diff --git a/llvm/test/CodeGen/AMDGPU/lower-module-lds-global-uses.ll b/llvm/test/CodeGen/AMDGPU/lower-module-lds-global-uses.ll
--- a/llvm/test/CodeGen/AMDGPU/lower-module-lds-global-uses.ll
+++ b/llvm/test/CodeGen/AMDGPU/lower-module-lds-global-uses.ll
@@ -17,7 +17,7 @@
 ; and @gptr.8 is used within non-kernel function @f1. Hence @lds.7 is lowered.
 ;.
-; CHECK: %llvm.amdgcn.module.lds.t = type { [3 x float], [4 x i8], [2 x float], [1 x float] }
+; CHECK: %llvm.amdgcn.module.lds.t = type { [3 x float], [4 x i8], [2 x float], [1 x float], [4 x i8] }
 
 ; CHECK: @lds.1 = addrspace(3) global i16 undef, align 2
 ; CHECK: @lds.2 = addrspace(3) global i32 undef, align 4
diff --git a/llvm/test/CodeGen/AMDGPU/lower-module-lds-indirect.ll b/llvm/test/CodeGen/AMDGPU/lower-module-lds-indirect.ll
--- a/llvm/test/CodeGen/AMDGPU/lower-module-lds-indirect.ll
+++ b/llvm/test/CodeGen/AMDGPU/lower-module-lds-indirect.ll
@@ -1,7 +1,7 @@
 ; RUN: opt -S -mtriple=amdgcn-- -amdgpu-lower-module-lds < %s | FileCheck %s
 ; RUN: opt -S -mtriple=amdgcn-- -passes=amdgpu-lower-module-lds < %s | FileCheck %s
 
-; CHECK: %llvm.amdgcn.module.lds.t = type { double, float }
+; CHECK: %llvm.amdgcn.module.lds.t = type { double, float, [4 x i8] }
 
 ; CHECK: @function_indirect = addrspace(1) global float* addrspacecast (float addrspace(3)* getelementptr inbounds (%llvm.amdgcn.module.lds.t, %llvm.amdgcn.module.lds.t addrspace(3)* @llvm.amdgcn.module.lds, i32 0, i32 1) to float*), align 8
diff --git a/llvm/test/CodeGen/AMDGPU/lower-module-lds-used-list.ll b/llvm/test/CodeGen/AMDGPU/lower-module-lds-used-list.ll
--- a/llvm/test/CodeGen/AMDGPU/lower-module-lds-used-list.ll
+++ b/llvm/test/CodeGen/AMDGPU/lower-module-lds-used-list.ll
@@ -3,7 +3,7 @@
 
 ; Check new struct is added to compiler.used and that the replaced variable is removed
 
-; CHECK: %llvm.amdgcn.module.lds.t = type { float }
+; CHECK: %llvm.amdgcn.module.lds.t = type { float, [4 x i8] }
 
 ; CHECK: @ignored = addrspace(1) global i64 0
 ; CHECK: @llvm.amdgcn.module.lds = internal addrspace(3) global %llvm.amdgcn.module.lds.t undef, align 8
diff --git a/llvm/test/CodeGen/AMDGPU/lower-module-lds.ll b/llvm/test/CodeGen/AMDGPU/lower-module-lds.ll
--- a/llvm/test/CodeGen/AMDGPU/lower-module-lds.ll
+++ b/llvm/test/CodeGen/AMDGPU/lower-module-lds.ll
@@ -3,7 +3,7 @@
 
 ; Padding to meet alignment, so references to @var1 replaced with gep ptr, 0, 2
 ; No i64 as addrspace(3) types with initializers are ignored. Likewise no addrspace(4).
-; CHECK: %llvm.amdgcn.module.lds.t = type { float, [4 x i8], i32 }
+; CHECK: %llvm.amdgcn.module.lds.t = type { float, [4 x i8], i32, [4 x i8] }
 
 ; Variables removed by pass
 ; CHECK-NOT: @var0
diff --git a/llvm/test/CodeGen/AMDGPU/update-lds-alignment.ll b/llvm/test/CodeGen/AMDGPU/update-lds-alignment.ll
--- a/llvm/test/CodeGen/AMDGPU/update-lds-alignment.ll
+++ b/llvm/test/CodeGen/AMDGPU/update-lds-alignment.ll
@@ -2,22 +2,22 @@
 ; RUN: opt -S -mtriple=amdgcn-- -passes=amdgpu-lower-module-lds < %s | FileCheck %s
 
 ; Properly aligned, same size as alignment.
-; CHECK: %llvm.amdgcn.kernel.k0.lds.t = type { [16 x i8], [8 x i8], [4 x i8], [2 x i8], [1 x i8] }
+; CHECK: %llvm.amdgcn.kernel.k0.lds.t = type { [16 x i8], [8 x i8], [4 x i8], [2 x i8], [1 x i8], [1 x i8] }
 
 ; Different properly aligned values, but same size of 1.
 ; CHECK: %llvm.amdgcn.kernel.k1.lds.t = type { [1 x i8], [7 x i8], [1 x i8], [3 x i8], [1 x i8], [1 x i8], [1 x i8], [1 x i8] }
 
 ; All are under-aligned, requires to fix each on different alignment boundary.
-; CHECK: %llvm.amdgcn.kernel.k2.lds.t = type { [9 x i8], [7 x i8], [5 x i8], [3 x i8], [3 x i8], [1 x i8], [2 x i8] }
+; CHECK: %llvm.amdgcn.kernel.k2.lds.t = type { [9 x i8], [7 x i8], [5 x i8], [3 x i8], [3 x i8], [1 x i8], [2 x i8], [2 x i8] }
 
 ; All LDS are underaligned, requires to allocate on 8 byte boundary
-; CHECK: %llvm.amdgcn.kernel.k3.lds.t = type { [7 x i8], [1 x i8], [7 x i8], [1 x i8], [6 x i8], [2 x i8], [5 x i8] }
+; CHECK: %llvm.amdgcn.kernel.k3.lds.t = type { [7 x i8], [1 x i8], [7 x i8], [1 x i8], [6 x i8], [2 x i8], [5 x i8], [3 x i8] }
 
 ; All LDS are underaligned, requires to allocate on 16 byte boundary
-; CHECK: %llvm.amdgcn.kernel.k4.lds.t = type { [12 x i8], [4 x i8], [11 x i8], [5 x i8], [10 x i8], [6 x i8], [9 x i8] }
+; CHECK: %llvm.amdgcn.kernel.k4.lds.t = type { [12 x i8], [4 x i8], [11 x i8], [5 x i8], [10 x i8], [6 x i8], [9 x i8], [7 x i8] }
 
 ; All LDS are properly aligned on 16 byte boundary, but they are of different size.
-; CHECK: %llvm.amdgcn.kernel.k5.lds.t = type { [20 x i8], [12 x i8], [19 x i8], [13 x i8], [18 x i8], [14 x i8], [17 x i8] }
+; CHECK: %llvm.amdgcn.kernel.k5.lds.t = type { [20 x i8], [12 x i8], [19 x i8], [13 x i8], [18 x i8], [14 x i8], [17 x i8], [15 x i8] }
 
 ; CHECK: @llvm.amdgcn.kernel.k0.lds = internal addrspace(3) global %llvm.amdgcn.kernel.k0.lds.t undef, align 16
 ; CHECK: @llvm.amdgcn.kernel.k1.lds = internal addrspace(3) global %llvm.amdgcn.kernel.k1.lds.t undef, align 16
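
The new trailing arrays in the k2 through k5 structs are each the gap between the byte total of the existing fields and that total rounded up to the struct's alignment. A sketch checking the four sizes; the alignments used below (16, except 8 for k3) are assumptions read off the test's comments rather than stated in these hunks:

```cpp
#include <cassert>
#include <cstdint>

// Size of the trailing padding array appended to a kernel LDS struct whose
// existing fields total FieldSum bytes and whose first field has MaxAlign.
static uint64_t trailingPad(uint64_t FieldSum, uint64_t MaxAlign) {
  uint64_t Rem = FieldSum % MaxAlign;
  return Rem ? MaxAlign - Rem : 0;
}

int main() {
  assert(trailingPad(9 + 7 + 5 + 3 + 3 + 1 + 2, 16) == 2);         // k2: [2 x i8]
  assert(trailingPad(7 + 1 + 7 + 1 + 6 + 2 + 5, 8) == 3);          // k3: [3 x i8]
  assert(trailingPad(12 + 4 + 11 + 5 + 10 + 6 + 9, 16) == 7);      // k4: [7 x i8]
  assert(trailingPad(20 + 12 + 19 + 13 + 18 + 14 + 17, 16) == 15); // k5: [15 x i8]
  return 0;
}
```
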