diff --git a/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp b/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp
--- a/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp
@@ -56,6 +56,29 @@ class AMDGPULowerModuleLDS : public ModulePass {
+  GlobalVariable *createPaddingObject(Module &M, uint64_t &CurrentOffset,
+                                      Align Alignment) {
+    uint64_t AlignValue = Alignment.value();
+    if (uint64_t Rem = CurrentOffset % AlignValue) {
+      uint64_t Padding = AlignValue - Rem;
+
+      // Append an array of padding bytes to meet alignment requested
+      // Note (o + (a - (o % a)) ) % a == 0
+      //      (offset + Padding ) % align == 0
+
+      Type *ATy = ArrayType::get(Type::getInt8Ty(M.getContext()), Padding);
+      GlobalVariable *PaddingObject = new GlobalVariable(
+          M, ATy, false, GlobalValue::InternalLinkage, UndefValue::get(ATy), "",
+          nullptr, GlobalValue::NotThreadLocal, AMDGPUAS::LOCAL_ADDRESS, false);
+
+      CurrentOffset += Padding;
+
+      return PaddingObject;
+    }
+
+    return nullptr;
+  }
+
   static void removeFromUsedList(Module &M, StringRef Name,
                                  SmallPtrSetImpl<Constant *> &ToRemove) {
     GlobalVariable *GV = M.getNamedGlobal(Name);
@@ -229,29 +252,19 @@
   {
     // This usually won't need to insert any padding, perhaps avoid the alloc
     uint64_t CurrentOffset = 0;
-    for (size_t I = 0; I < FoundLocalVars.size(); I++) {
-      GlobalVariable *FGV = FoundLocalVars[I];
-      Align DataAlign = AMDGPU::getAlign(DL, FGV);
-
-      uint64_t DataAlignV = DataAlign.value();
-      if (uint64_t Rem = CurrentOffset % DataAlignV) {
-        uint64_t Padding = DataAlignV - Rem;
-
-        // Append an array of padding bytes to meet alignment requested
-        // Note (o + (a - (o % a)) ) % a == 0
-        //      (offset + Padding ) % align == 0
-
-        Type *ATy = ArrayType::get(Type::getInt8Ty(Ctx), Padding);
-        LocalVars.push_back(new GlobalVariable(
-            M, ATy, false, GlobalValue::InternalLinkage, UndefValue::get(ATy),
-            "", nullptr, GlobalValue::NotThreadLocal, AMDGPUAS::LOCAL_ADDRESS,
-            false));
-        CurrentOffset += Padding;
+    for (auto *FGV : FoundLocalVars) {
+      Align Alignment = AMDGPU::getAlign(DL, FGV);
+      if (auto *PO = createPaddingObject(M, CurrentOffset, Alignment)) {
+        LocalVars.push_back(PO);
       }
-
       LocalVars.push_back(FGV);
       CurrentOffset += DL.getTypeAllocSize(FGV->getValueType());
     }
+
+    Align Alignment = AMDGPU::getAlign(DL, LocalVars[0]);
+    if (auto *PO = createPaddingObject(M, CurrentOffset, Alignment)) {
+      LocalVars.push_back(PO);
+    }
   }
 
   std::vector<Type *> LocalVarTypes;
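
The helper factors the padding arithmetic out of the layout loop so the same rule can also run once after the loop, padding the struct's tail to the alignment of its first (most aligned) member. A minimal standalone sketch of that rule, with plain integers standing in for `Align` and the padding `GlobalVariable` (`padToAlign` is an illustrative name, not part of the pass):

```cpp
#include <cassert>
#include <cstdint>

// Illustrative helper mirroring createPaddingObject's arithmetic: returns the
// padding needed to raise Offset to the next multiple of Alignment (0 if it
// is already aligned) and advances Offset past that padding.
static uint64_t padToAlign(uint64_t &Offset, uint64_t Alignment) {
  uint64_t Padding = 0;
  if (uint64_t Rem = Offset % Alignment)
    Padding = Alignment - Rem; // (Offset + Padding) % Alignment == 0
  Offset += Padding;
  return Padding;
}

int main() {
  // A [38 x i8] variable followed by a 16-byte-aligned neighbour: 38 -> 48,
  // i.e. the [10 x i8] padding arrays seen throughout the test updates.
  uint64_t Offset = 38;
  assert(padToAlign(Offset, 16) == 10 && Offset == 48);
  // An already-aligned offset gets no padding object (createPaddingObject
  // returns nullptr in that case).
  assert(padToAlign(Offset, 16) == 0 && Offset == 48);
  return 0;
}
```
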
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/hip.extern.shared.array.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/hip.extern.shared.array.ll
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/hip.extern.shared.array.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/hip.extern.shared.array.ll
@@ -67,7 +67,7 @@
 ; specified.
 ; CHECK-LABEL: {{^}}dynamic_shared_array_3:
 ; CHECK: v_lshlrev_b32_e32 [[IDX:v[0-9]+]], 2, {{v[0-9]+}}
-; CHECK: v_add_u32_e32 {{v[0-9]+}}, 0x44, [[IDX]]
+; CHECK: v_add_u32_e32 {{v[0-9]+}}, 0x50, [[IDX]]
 define amdgpu_kernel void @dynamic_shared_array_3(i32 %idx) {
   %tid.x = tail call i32 @llvm.amdgcn.workitem.id.x()
   %vidx = add i32 %tid.x, %idx
@@ -82,7 +82,7 @@
 ; The offset to the dynamic shared memory array should be aligned on the
 ; maximal one.
 ; CHECK-LABEL: {{^}}dynamic_shared_array_4:
-; CHECK: v_mov_b32_e32 [[DYNLDS:v[0-9]+]], 0x48
+; CHECK: v_mov_b32_e32 [[DYNLDS:v[0-9]+]], 0x50
 ; CHECK: v_lshlrev_b32_e32 [[IDX:v[0-9]+]], 2, {{v[0-9]+}}
 ; CHECK: v_add_u32_e32 {{v[0-9]+}}, [[DYNLDS]], [[IDX]]
 define amdgpu_kernel void @dynamic_shared_array_4(i32 %idx) {
@@ -101,7 +101,7 @@
 
 ; Honor the explicit alignment from the specified variable.
 ; CHECK-LABEL: {{^}}dynamic_shared_array_5:
-; CHECK: v_mov_b32_e32 [[DYNLDS:v[0-9]+]], 0x44
+; CHECK: v_mov_b32_e32 [[DYNLDS:v[0-9]+]], 0x50
 ; CHECK: v_lshlrev_b32_e32 [[IDX:v[0-9]+]], 2, {{v[0-9]+}}
 ; CHECK: v_add_u32_e32 {{v[0-9]+}}, [[DYNLDS]], [[IDX]]
 define amdgpu_kernel void @dynamic_shared_array_5(i32 %idx) {
diff --git a/llvm/test/CodeGen/AMDGPU/hip.extern.shared.array.ll b/llvm/test/CodeGen/AMDGPU/hip.extern.shared.array.ll
--- a/llvm/test/CodeGen/AMDGPU/hip.extern.shared.array.ll
+++ b/llvm/test/CodeGen/AMDGPU/hip.extern.shared.array.ll
@@ -64,7 +64,7 @@
 ; The offset to the dynamic shared memory array should be aligned on the type
 ; specified.
 ; CHECK-LABEL: {{^}}dynamic_shared_array_3:
-; CHECK: v_mov_b32_e32 [[DYNLDS:v[0-9]+]], 0x44
+; CHECK: v_mov_b32_e32 [[DYNLDS:v[0-9]+]], 0x50
 ; CHECK: v_lshl_add_u32 {{v[0-9]+}}, {{v[0-9]+}}, 2, [[DYNLDS]]
 define amdgpu_kernel void @dynamic_shared_array_3(i32 %idx) {
   %tid.x = tail call i32 @llvm.amdgcn.workitem.id.x()
@@ -80,7 +80,7 @@
 ; The offset to the dynamic shared memory array should be aligned on the
 ; maximal one.
 ; CHECK-LABEL: {{^}}dynamic_shared_array_4:
-; CHECK: v_mov_b32_e32 [[DYNLDS:v[0-9]+]], 0x48
+; CHECK: v_mov_b32_e32 [[DYNLDS:v[0-9]+]], 0x50
 ; CHECK-DAG: v_lshl_add_u32 {{v[0-9]+}}, {{v[0-9]+}}, 2, [[DYNLDS]]
 ; CHECK-DAG: v_lshl_add_u32 {{v[0-9]+}}, {{v[0-9]+}}, 3, [[DYNLDS]]
 define amdgpu_kernel void @dynamic_shared_array_4(i32 %idx) {
@@ -99,7 +99,7 @@
 
 ; Honor the explicit alignment from the specified variable.
 ; CHECK-LABEL: {{^}}dynamic_shared_array_5:
-; CHECK: v_mov_b32_e32 [[DYNLDS:v[0-9]+]], 0x44
+; CHECK: v_mov_b32_e32 [[DYNLDS:v[0-9]+]], 0x50
 ; CHECK-DAG: v_lshl_add_u32 {{v[0-9]+}}, {{v[0-9]+}}, 2, [[DYNLDS]]
 ; CHECK-DAG: v_lshl_add_u32 {{v[0-9]+}}, {{v[0-9]+}}, 3, [[DYNLDS]]
 define amdgpu_kernel void @dynamic_shared_array_5(i32 %idx) {
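
All of these kernels now place the dynamic array at the same offset because the static LDS block, previously 0x44 (68) or 0x48 (72) bytes, is padded up to the struct's alignment before the dynamic portion begins. A sketch of that rounding, assuming the 16-byte alignment implied by the new 0x50 checks (`alignToPow2` is an illustrative stand-in for `llvm::alignTo`, valid for the power-of-two alignments used here):

```cpp
#include <cassert>
#include <cstdint>

// Round Value up to the next multiple of Align; correct for power-of-two
// alignments, matching llvm::alignTo for these inputs.
static uint64_t alignToPow2(uint64_t Value, uint64_t Align) {
  return (Value + Align - 1) & ~(Align - 1);
}

int main() {
  assert(alignToPow2(0x44, 16) == 0x50); // dynamic_shared_array_3/_5: 68 -> 80
  assert(alignToPow2(0x48, 16) == 0x50); // dynamic_shared_array_4:    72 -> 80
  return 0;
}
```
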
diff --git a/llvm/test/CodeGen/AMDGPU/lds-alignment.ll b/llvm/test/CodeGen/AMDGPU/lds-alignment.ll
--- a/llvm/test/CodeGen/AMDGPU/lds-alignment.ll
+++ b/llvm/test/CodeGen/AMDGPU/lds-alignment.ll
@@ -12,9 +12,10 @@
 declare void @llvm.memcpy.p3i8.p1i8.i32(i8 addrspace(3)* nocapture, i8 addrspace(1)* nocapture readonly, i32, i1) #0
 declare void @llvm.memcpy.p1i8.p3i8.i32(i8 addrspace(1)* nocapture, i8 addrspace(3)* nocapture readonly, i32, i1) #0
 
-
+; 38 + (10 pad) = 48
+;
 ; HSA-LABEL: {{^}}test_no_round_size_1:
-; HSA: workgroup_group_segment_byte_size = 38
+; HSA: workgroup_group_segment_byte_size = 48
 define amdgpu_kernel void @test_no_round_size_1(i8 addrspace(1)* %out, i8 addrspace(1)* %in) #1 {
   %lds.align16.0.bc = bitcast [38 x i8] addrspace(3)* @lds.align16.0 to i8 addrspace(3)*
   call void @llvm.memcpy.p3i8.p1i8.i32(i8 addrspace(3)* align 4 %lds.align16.0.bc, i8 addrspace(1)* align 4 %in, i32 38, i1 false)
@@ -22,17 +23,10 @@
   ret void
 }
 
-; There are two objects, so one requires padding to be correctly
-; aligned after the other.
-
-; (38 -> 48) + 38 = 86
-
-; I don't think it is necessary to add padding after since if there
-; were to be a dynamically sized LDS kernel arg, the runtime should
-; add the alignment padding if necessary alignment padding if needed.
-
+; 38 + (10 pad) + 38 + (10 pad) = 96
+;
 ; HSA-LABEL: {{^}}test_round_size_2:
-; HSA: workgroup_group_segment_byte_size = 86
+; HSA: workgroup_group_segment_byte_size = 96
 ; HSA: group_segment_alignment = 4
 define amdgpu_kernel void @test_round_size_2(i8 addrspace(1)* %out, i8 addrspace(1)* %in) #1 {
   %lds.align16.0.bc = bitcast [38 x i8] addrspace(3)* @lds.align16.0 to i8 addrspace(3)*
@@ -46,9 +40,10 @@
   ret void
 }
 
-; 38 + (10 pad) + 38 (= 86)
+; 38 + (10 pad) + 38 + (10 pad) = 96
+;
 ; HSA-LABEL: {{^}}test_round_size_2_align_8:
-; HSA: workgroup_group_segment_byte_size = 86
+; HSA: workgroup_group_segment_byte_size = 96
 ; HSA: group_segment_alignment = 4
 define amdgpu_kernel void @test_round_size_2_align_8(i8 addrspace(1)* %out, i8 addrspace(1)* %in) #1 {
   %lds.align16.0.bc = bitcast [38 x i8] addrspace(3)* @lds.align16.0 to i8 addrspace(3)*
@@ -62,8 +57,10 @@
   ret void
 }
 
+; 38 + (10 pad) = 48
+;
 ; HSA-LABEL: {{^}}test_round_local_lds_and_arg:
-; HSA: workgroup_group_segment_byte_size = 38
+; HSA: workgroup_group_segment_byte_size = 48
 ; HSA: group_segment_alignment = 4
 define amdgpu_kernel void @test_round_local_lds_and_arg(i8 addrspace(1)* %out, i8 addrspace(1)* %in, i8 addrspace(3)* %lds.arg) #1 {
   %lds.align16.0.bc = bitcast [38 x i8] addrspace(3)* @lds.align16.0 to i8 addrspace(3)*
@@ -75,6 +72,8 @@
   ret void
 }
 
+; FIXME: Parameter alignment not considered
+;
 ; HSA-LABEL: {{^}}test_round_lds_arg:
 ; HSA: workgroup_group_segment_byte_size = 0
 ; HSA: group_segment_alignment = 4
@@ -85,6 +84,7 @@
 }
 
 ; FIXME: Parameter alignment not considered
+;
 ; HSA-LABEL: {{^}}test_high_align_lds_arg:
 ; HSA: workgroup_group_segment_byte_size = 0
 ; HSA: group_segment_alignment = 4
@@ -94,9 +94,10 @@
   ret void
 }
 
-; (39 * 4) + (4 pad) + (7 * 8) = 216
+; (39 * 4) + (4 pad) + (7 * 8) + (8 pad) = 224
+;
 ; HSA-LABEL: {{^}}test_missing_alignment_size_2_order0:
-; HSA: workgroup_group_segment_byte_size = 216
+; HSA: workgroup_group_segment_byte_size = 224
 ; HSA: group_segment_alignment = 4
 define amdgpu_kernel void @test_missing_alignment_size_2_order0(i8 addrspace(1)* %out, i8 addrspace(1)* %in) #1 {
   %lds.missing.align.0.bc = bitcast [39 x i32] addrspace(3)* @lds.missing.align.0 to i8 addrspace(3)*
@@ -110,9 +111,10 @@
   ret void
 }
 
-; (39 * 4) + (4 pad) + (7 * 8) = 216
+; (39 * 4) + (4 pad) + (7 * 8) + (8 pad) = 224
+;
 ; HSA-LABEL: {{^}}test_missing_alignment_size_2_order1:
-; HSA: workgroup_group_segment_byte_size = 216
+; HSA: workgroup_group_segment_byte_size = 224
 ; HSA: group_segment_alignment = 4
 define amdgpu_kernel void @test_missing_alignment_size_2_order1(i8 addrspace(1)* %out, i8 addrspace(1)* %in) #1 {
   %lds.missing.align.1.bc = bitcast [7 x i64] addrspace(3)* @lds.missing.align.1 to i8 addrspace(3)*
@@ -126,10 +128,10 @@
   ret void
 }
 
-; align 32, 16, 16
-; 38 + (10 pad) + 38 + (10 pad) + 38 ( = 134)
+; 38 + (10 pad) + 38 + (10 pad) + 38 + (26 pad) = 160
+;
 ; HSA-LABEL: {{^}}test_round_size_3_order0:
-; HSA: workgroup_group_segment_byte_size = 134
+; HSA: workgroup_group_segment_byte_size = 160
 ; HSA: group_segment_alignment = 4
 define amdgpu_kernel void @test_round_size_3_order0(i8 addrspace(1)* %out, i8 addrspace(1)* %in) #1 {
   %lds.align32.0.bc = bitcast [38 x i8] addrspace(3)* @lds.align32.0 to i8 addrspace(3)*
@@ -147,10 +149,10 @@
   ret void
 }
 
-; align 32, 16, 16
-; 38 (+ 10 pad) + 38 + (10 pad) + 38 ( = 134)
+; 38 + (10 pad) + 38 + (10 pad) + 38 + (26 pad) = 160
+;
 ; HSA-LABEL: {{^}}test_round_size_3_order1:
-; HSA: workgroup_group_segment_byte_size = 134
+; HSA: workgroup_group_segment_byte_size = 160
 ; HSA: group_segment_alignment = 4
 define amdgpu_kernel void @test_round_size_3_order1(i8 addrspace(1)* %out, i8 addrspace(1)* %in) #1 {
   %lds.align32.0.bc = bitcast [38 x i8] addrspace(3)* @lds.align32.0 to i8 addrspace(3)*
@@ -168,10 +170,10 @@
   ret void
 }
 
-; align 32, 16, 16
-; 38 + (10 pad) + 38 + (10 pad) + 38 ( = 126)
+; 38 + (10 pad) + 38 + (10 pad) + 38 + (26 pad) = 160
+;
 ; HSA-LABEL: {{^}}test_round_size_3_order2:
-; HSA: workgroup_group_segment_byte_size = 134
+; HSA: workgroup_group_segment_byte_size = 160
 ; HSA: group_segment_alignment = 4
 define amdgpu_kernel void @test_round_size_3_order2(i8 addrspace(1)* %out, i8 addrspace(1)* %in) #1 {
   %lds.align16.0.bc = bitcast [38 x i8] addrspace(3)* @lds.align16.0 to i8 addrspace(3)*
@@ -189,10 +191,10 @@
   ret void
 }
 
-; align 32, 16, 16
-; 38 + (10 pad) + 38 + (10 pad) + 38 ( = 134)
+; 38 + (10 pad) + 38 + (10 pad) + 38 + (26 pad) = 160
+;
 ; HSA-LABEL: {{^}}test_round_size_3_order3:
-; HSA: workgroup_group_segment_byte_size = 134
+; HSA: workgroup_group_segment_byte_size = 160
 ; HSA: group_segment_alignment = 4
 define amdgpu_kernel void @test_round_size_3_order3(i8 addrspace(1)* %out, i8 addrspace(1)* %in) #1 {
   %lds.align16.0.bc = bitcast [38 x i8] addrspace(3)* @lds.align16.0 to i8 addrspace(3)*
@@ -210,10 +212,10 @@
   ret void
 }
 
-; align 32, 16, 16
-; 38 + (10 pad) + 38 + (10 pad) + 38 (= 134)
+; 38 + (10 pad) + 38 + (10 pad) + 38 + (26 pad) = 160
+;
 ; HSA-LABEL: {{^}}test_round_size_3_order4:
-; HSA: workgroup_group_segment_byte_size = 134
+; HSA: workgroup_group_segment_byte_size = 160
 ; HSA: group_segment_alignment = 4
 define amdgpu_kernel void @test_round_size_3_order4(i8 addrspace(1)* %out, i8 addrspace(1)* %in) #1 {
   %lds.align8.0.bc = bitcast [38 x i8] addrspace(3)* @lds.align8.0 to i8 addrspace(3)*
@@ -231,10 +233,10 @@
   ret void
 }
 
-; align 32, 16, 16
-; 38 + (10 pad) + 38 + (10 pad) + 38 (= 134)
+; 38 + (10 pad) + 38 + (10 pad) + 38 + (26 pad) = 160
+;
 ; HSA-LABEL: {{^}}test_round_size_3_order5:
-; HSA: workgroup_group_segment_byte_size = 134
+; HSA: workgroup_group_segment_byte_size = 160
 ; HSA: group_segment_alignment = 4
 define amdgpu_kernel void @test_round_size_3_order5(i8 addrspace(1)* %out, i8 addrspace(1)* %in) #1 {
   %lds.align8.0.bc = bitcast [38 x i8] addrspace(3)* @lds.align8.0 to i8 addrspace(3)*
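
The updated byte sizes above all follow from one rule: lay the variables out with padding before each misaligned member, then pad the total to the alignment of the first (most aligned) member, which is exactly what the new trailing `createPaddingObject` call does. A sketch reproducing the three totals, with `{size, alignment}` pairs read off this test's globals:

```cpp
#include <cassert>
#include <cstdint>
#include <utility>
#include <vector>

// Lay out {size, alignment} pairs (already sorted most-aligned first) with
// padding before each member, then pad the total to the first member's
// alignment, mirroring the new end-of-loop padding object.
static uint64_t
segmentSize(const std::vector<std::pair<uint64_t, uint64_t>> &Vars) {
  uint64_t Offset = 0;
  for (const auto &[Size, Alignment] : Vars) {
    if (uint64_t Rem = Offset % Alignment)
      Offset += Alignment - Rem; // padding object before the variable
    Offset += Size;
  }
  uint64_t MaxAlign = Vars.front().second;
  if (uint64_t Rem = Offset % MaxAlign)
    Offset += MaxAlign - Rem; // trailing padding object
  return Offset;
}

int main() {
  assert(segmentSize({{38, 16}}) == 48);                      // test_no_round_size_1
  assert(segmentSize({{38, 16}, {38, 16}}) == 96);            // test_round_size_2*
  assert(segmentSize({{38, 32}, {38, 16}, {38, 16}}) == 160); // test_round_size_3_order*
  return 0;
}
```
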
diff --git a/llvm/test/CodeGen/AMDGPU/lower-kernel-and-module-lds.ll b/llvm/test/CodeGen/AMDGPU/lower-kernel-and-module-lds.ll
--- a/llvm/test/CodeGen/AMDGPU/lower-kernel-and-module-lds.ll
+++ b/llvm/test/CodeGen/AMDGPU/lower-kernel-and-module-lds.ll
@@ -7,9 +7,9 @@
 @lds.size.8.align.8 = internal unnamed_addr addrspace(3) global [8 x i8] undef, align 8
 @lds.size.16.align.16 = internal unnamed_addr addrspace(3) global [16 x i8] undef, align 16
 
-; CHECK: %llvm.amdgcn.module.lds.t = type { [8 x i8], [1 x i8] }
-; CHECK: %llvm.amdgcn.kernel.k0.lds.t = type { [16 x i8], [4 x i8], [2 x i8] }
-; CHECK: %llvm.amdgcn.kernel.k1.lds.t = type { [16 x i8], [4 x i8], [2 x i8] }
+; CHECK: %llvm.amdgcn.module.lds.t = type { [8 x i8], [1 x i8], [7 x i8] }
+; CHECK: %llvm.amdgcn.kernel.k0.lds.t = type { [16 x i8], [4 x i8], [2 x i8], [10 x i8] }
+; CHECK: %llvm.amdgcn.kernel.k1.lds.t = type { [16 x i8], [4 x i8], [2 x i8], [10 x i8] }
 ; CHECK: %llvm.amdgcn.kernel..lds.t = type { [2 x i8] }
 ; CHECK: %llvm.amdgcn.kernel..lds.t.0 = type { [4 x i8] }
diff --git a/llvm/test/CodeGen/AMDGPU/lower-kernel-lds-global-uses.ll b/llvm/test/CodeGen/AMDGPU/lower-kernel-lds-global-uses.ll
--- a/llvm/test/CodeGen/AMDGPU/lower-kernel-lds-global-uses.ll
+++ b/llvm/test/CodeGen/AMDGPU/lower-kernel-lds-global-uses.ll
@@ -13,7 +13,7 @@
 ; @lds.6: is part of @llvm.compiler.used list, but is not used within kernel, hence it is not lowered.
 ;.
-; CHECK: %llvm.amdgcn.kernel.k0.lds.t = type { i32, i16 }
+; CHECK: %llvm.amdgcn.kernel.k0.lds.t = type { i32, i16, [2 x i8] }
 
 ; CHECK-NOT: @lds.1
 ; CHECK-NOT: @lds.2
diff --git a/llvm/test/CodeGen/AMDGPU/lower-kernel-lds.ll b/llvm/test/CodeGen/AMDGPU/lower-kernel-lds.ll
--- a/llvm/test/CodeGen/AMDGPU/lower-kernel-lds.ll
+++ b/llvm/test/CodeGen/AMDGPU/lower-kernel-lds.ll
@@ -7,8 +7,8 @@
 @lds.size.8.align.8 = internal unnamed_addr addrspace(3) global [8 x i8] undef, align 8
 @lds.size.16.align.16 = internal unnamed_addr addrspace(3) global [16 x i8] undef, align 16
 
-; CHECK: %llvm.amdgcn.kernel.k0.lds.t = type { [16 x i8], [4 x i8], [2 x i8], [1 x i8] }
-; CHECK: %llvm.amdgcn.kernel.k1.lds.t = type { [16 x i8], [4 x i8], [2 x i8] }
+; CHECK: %llvm.amdgcn.kernel.k0.lds.t = type { [16 x i8], [4 x i8], [2 x i8], [1 x i8], [9 x i8] }
+; CHECK: %llvm.amdgcn.kernel.k1.lds.t = type { [16 x i8], [4 x i8], [2 x i8], [10 x i8] }
 ;.
 ; CHECK: @lds.size.8.align.8 = internal unnamed_addr addrspace(3) global [8 x i8] undef, align 8
diff --git a/llvm/test/CodeGen/AMDGPU/lower-module-lds-global-alias.ll b/llvm/test/CodeGen/AMDGPU/lower-module-lds-global-alias.ll
--- a/llvm/test/CodeGen/AMDGPU/lower-module-lds-global-alias.ll
+++ b/llvm/test/CodeGen/AMDGPU/lower-module-lds-global-alias.ll
@@ -16,7 +16,7 @@
 ; But none of them are used anywhere. Hence, @lds.6 is not lowered.
 ;.
-; CHECK: %llvm.amdgcn.module.lds.t = type { [4 x i8], [3 x i8], [1 x i8], [2 x i8], [1 x i8] }
+; CHECK: %llvm.amdgcn.module.lds.t = type { [4 x i8], [3 x i8], [1 x i8], [2 x i8], [1 x i8], [1 x i8] }
 
 ; CHECK-NOT: @lds.1
 ; CHECK-NOT: @lds.2
diff --git a/llvm/test/CodeGen/AMDGPU/lower-module-lds-global-uses.ll b/llvm/test/CodeGen/AMDGPU/lower-module-lds-global-uses.ll
--- a/llvm/test/CodeGen/AMDGPU/lower-module-lds-global-uses.ll
+++ b/llvm/test/CodeGen/AMDGPU/lower-module-lds-global-uses.ll
@@ -17,7 +17,7 @@
 ; and @gptr.8 is used within non-kernel function @f1. Hence @lds.7 is lowered.
 ;.
-; CHECK: %llvm.amdgcn.module.lds.t = type { [3 x float], [4 x i8], [2 x float], [1 x float] }
+; CHECK: %llvm.amdgcn.module.lds.t = type { [3 x float], [4 x i8], [2 x float], [1 x float], [4 x i8] }
 
 ; CHECK: @lds.1 = addrspace(3) global i16 undef, align 2
 ; CHECK: @lds.2 = addrspace(3) global i32 undef, align 4
diff --git a/llvm/test/CodeGen/AMDGPU/lower-module-lds-indirect.ll b/llvm/test/CodeGen/AMDGPU/lower-module-lds-indirect.ll
--- a/llvm/test/CodeGen/AMDGPU/lower-module-lds-indirect.ll
+++ b/llvm/test/CodeGen/AMDGPU/lower-module-lds-indirect.ll
@@ -1,7 +1,7 @@
 ; RUN: opt -S -mtriple=amdgcn-- -amdgpu-lower-module-lds < %s | FileCheck %s
 ; RUN: opt -S -mtriple=amdgcn-- -passes=amdgpu-lower-module-lds < %s | FileCheck %s
 
-; CHECK: %llvm.amdgcn.module.lds.t = type { double, float }
+; CHECK: %llvm.amdgcn.module.lds.t = type { double, float, [4 x i8] }
 
 ; CHECK: @function_indirect = addrspace(1) global float* addrspacecast (float addrspace(3)* getelementptr inbounds (%llvm.amdgcn.module.lds.t, %llvm.amdgcn.module.lds.t addrspace(3)* @llvm.amdgcn.module.lds, i32 0, i32 1) to float*), align 8
diff --git a/llvm/test/CodeGen/AMDGPU/lower-module-lds-used-list.ll b/llvm/test/CodeGen/AMDGPU/lower-module-lds-used-list.ll
--- a/llvm/test/CodeGen/AMDGPU/lower-module-lds-used-list.ll
+++ b/llvm/test/CodeGen/AMDGPU/lower-module-lds-used-list.ll
@@ -3,7 +3,7 @@
 
 ; Check new struct is added to compiler.used and that the replaced variable is removed
 
-; CHECK: %llvm.amdgcn.module.lds.t = type { float }
+; CHECK: %llvm.amdgcn.module.lds.t = type { float, [4 x i8] }
 
 ; CHECK: @ignored = addrspace(1) global i64 0
 ; CHECK: @llvm.amdgcn.module.lds = internal addrspace(3) global %llvm.amdgcn.module.lds.t undef, align 8
diff --git a/llvm/test/CodeGen/AMDGPU/lower-module-lds.ll b/llvm/test/CodeGen/AMDGPU/lower-module-lds.ll
--- a/llvm/test/CodeGen/AMDGPU/lower-module-lds.ll
+++ b/llvm/test/CodeGen/AMDGPU/lower-module-lds.ll
@@ -3,7 +3,7 @@
 
 ; Padding to meet alignment, so references to @var1 replaced with gep ptr, 0, 2
 ; No i64 as addrspace(3) types with initializers are ignored. Likewise no addrspace(4).
-; CHECK: %llvm.amdgcn.module.lds.t = type { float, [4 x i8], i32 }
+; CHECK: %llvm.amdgcn.module.lds.t = type { float, [4 x i8], i32, [4 x i8] }
 
 ; Variables removed by pass
 ; CHECK-NOT: @var0
diff --git a/llvm/test/CodeGen/AMDGPU/update-lds-alignment.ll b/llvm/test/CodeGen/AMDGPU/update-lds-alignment.ll
--- a/llvm/test/CodeGen/AMDGPU/update-lds-alignment.ll
+++ b/llvm/test/CodeGen/AMDGPU/update-lds-alignment.ll
@@ -2,22 +2,22 @@
 ; RUN: opt -S -mtriple=amdgcn-- -passes=amdgpu-lower-module-lds < %s | FileCheck %s
 
 ; Properly aligned, same size as alignment.
-; CHECK: %llvm.amdgcn.kernel.k0.lds.t = type { [16 x i8], [8 x i8], [4 x i8], [2 x i8], [1 x i8] }
+; CHECK: %llvm.amdgcn.kernel.k0.lds.t = type { [16 x i8], [8 x i8], [4 x i8], [2 x i8], [1 x i8], [1 x i8] }
 
 ; Different properly aligned values, but same size of 1.
 ; CHECK: %llvm.amdgcn.kernel.k1.lds.t = type { [1 x i8], [7 x i8], [1 x i8], [3 x i8], [1 x i8], [1 x i8], [1 x i8], [1 x i8] }
 
 ; All are under-aligned, requires to fix each on different alignment boundary.
-; CHECK: %llvm.amdgcn.kernel.k2.lds.t = type { [9 x i8], [7 x i8], [5 x i8], [3 x i8], [3 x i8], [1 x i8], [2 x i8] }
+; CHECK: %llvm.amdgcn.kernel.k2.lds.t = type { [9 x i8], [7 x i8], [5 x i8], [3 x i8], [3 x i8], [1 x i8], [2 x i8], [2 x i8] }
 
 ; All LDS are underaligned, requires to allocate on 8 byte boundary
-; CHECK: %llvm.amdgcn.kernel.k3.lds.t = type { [7 x i8], [1 x i8], [7 x i8], [1 x i8], [6 x i8], [2 x i8], [5 x i8] }
+; CHECK: %llvm.amdgcn.kernel.k3.lds.t = type { [7 x i8], [1 x i8], [7 x i8], [1 x i8], [6 x i8], [2 x i8], [5 x i8], [3 x i8] }
 
 ; All LDS are underaligned, requires to allocate on 16 byte boundary
-; CHECK: %llvm.amdgcn.kernel.k4.lds.t = type { [12 x i8], [4 x i8], [11 x i8], [5 x i8], [10 x i8], [6 x i8], [9 x i8] }
+; CHECK: %llvm.amdgcn.kernel.k4.lds.t = type { [12 x i8], [4 x i8], [11 x i8], [5 x i8], [10 x i8], [6 x i8], [9 x i8], [7 x i8] }
 
 ; All LDS are properly aligned on 16 byte boundary, but they are of different size.
-; CHECK: %llvm.amdgcn.kernel.k5.lds.t = type { [20 x i8], [12 x i8], [19 x i8], [13 x i8], [18 x i8], [14 x i8], [17 x i8] }
+; CHECK: %llvm.amdgcn.kernel.k5.lds.t = type { [20 x i8], [12 x i8], [19 x i8], [13 x i8], [18 x i8], [14 x i8], [17 x i8], [15 x i8] }
 
 ; CHECK: @llvm.amdgcn.kernel.k0.lds = internal addrspace(3) global %llvm.amdgcn.kernel.k0.lds.t undef, align 16
 ; CHECK: @llvm.amdgcn.kernel.k1.lds = internal addrspace(3) global %llvm.amdgcn.kernel.k1.lds.t undef, align 16
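
The new trailing arrays in the k2 through k5 structs are each the gap between the byte total of the existing fields and that total rounded up to the struct's alignment. A sketch checking the four sizes; the alignments used below (16, except 8 for k3) are assumptions read off the test's comments rather than stated in these hunks:

```cpp
#include <cassert>
#include <cstdint>

// Size of the trailing padding array appended to a kernel LDS struct whose
// existing fields total FieldSum bytes and whose first field has MaxAlign.
static uint64_t trailingPad(uint64_t FieldSum, uint64_t MaxAlign) {
  uint64_t Rem = FieldSum % MaxAlign;
  return Rem ? MaxAlign - Rem : 0;
}

int main() {
  assert(trailingPad(9 + 7 + 5 + 3 + 3 + 1 + 2, 16) == 2);         // k2: [2 x i8]
  assert(trailingPad(7 + 1 + 7 + 1 + 6 + 2 + 5, 8) == 3);          // k3: [3 x i8]
  assert(trailingPad(12 + 4 + 11 + 5 + 10 + 6 + 9, 16) == 7);      // k4: [7 x i8]
  assert(trailingPad(20 + 12 + 19 + 13 + 18 + 14 + 17, 16) == 15); // k5: [15 x i8]
  return 0;
}
```
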