diff --git a/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp b/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp --- a/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp @@ -73,8 +73,6 @@ // The "module" lowering implemented here finds LDS variables which are used by // non-kernel functions and creates a new struct with a field for each of those // LDS variables. Variables that are only used from kernels are excluded. -// Kernels that do not use this struct are annoteated with the attribute -// amdgpu-elide-module-lds which allows the back end to elide the allocation. // // The "table" lowering implemented here has three components. // First kernels are assigned a unique integer identifier which is available in @@ -906,9 +904,6 @@ }); markUsedByKernel(Builder, &Func, ModuleScopeReplacement.SGV); - - } else { - markElideModuleLDS(Func); } } @@ -1106,14 +1101,6 @@ return KernelToCreatedDynamicLDS; } - static bool canElideModuleLDS(const Function &F) { - return F.hasFnAttribute("amdgpu-elide-module-lds"); - } - - static void markElideModuleLDS(Function &F) { - F.addFnAttr("amdgpu-elide-module-lds"); - } - bool runOnModule(Module &M) override { CallGraph CG = CallGraph(M); bool Changed = superAlignLDSGlobals(M); @@ -1231,7 +1218,7 @@ //} const bool AllocateModuleScopeStruct = - MaybeModuleScopeStruct && !canElideModuleLDS(Func); + MaybeModuleScopeStruct && KernelsThatAllocateModuleLDS.contains(&Func); auto Replacement = KernelToReplacement.find(&Func); const bool AllocateKernelScopeStruct = diff --git a/llvm/test/CodeGen/AMDGPU/lds-frame-extern.ll b/llvm/test/CodeGen/AMDGPU/lds-frame-extern.ll --- a/llvm/test/CodeGen/AMDGPU/lds-frame-extern.ll +++ b/llvm/test/CodeGen/AMDGPU/lds-frame-extern.ll @@ -90,7 +90,7 @@ ; kernel variable normal/overaligned ; extern variable normal/overaligned -define amdgpu_kernel void @module_0_kernel_normal_extern_normal(i32 %idx) #1 { +define amdgpu_kernel void @module_0_kernel_normal_extern_normal(i32 %idx) { ; CHECK-LABEL: module_0_kernel_normal_extern_normal: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_load_dword s0, s[4:5], 0x0 @@ -148,7 +148,7 @@ ret void } -define amdgpu_kernel void @module_0_kernel_overalign_extern_normal(i32 %idx) #1 { +define amdgpu_kernel void @module_0_kernel_overalign_extern_normal(i32 %idx) { ; CHECK-LABEL: module_0_kernel_overalign_extern_normal: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_load_dword s0, s[4:5], 0x0 @@ -206,7 +206,7 @@ ret void } -define amdgpu_kernel void @module_0_kernel_normal_extern_overalign(i32 %idx) #1 { +define amdgpu_kernel void @module_0_kernel_normal_extern_overalign(i32 %idx) { ; CHECK-LABEL: module_0_kernel_normal_extern_overalign: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_load_dword s0, s[4:5], 0x0 @@ -264,7 +264,7 @@ ret void } -define amdgpu_kernel void @module_0_kernel_overalign_extern_overalign(i32 %idx) #1 { +define amdgpu_kernel void @module_0_kernel_overalign_extern_overalign(i32 %idx) { ; CHECK-LABEL: module_0_kernel_overalign_extern_overalign: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_load_dword s0, s[4:5], 0x0 @@ -329,7 +329,7 @@ ; kernel variable normal/overaligned ; extern variable normal/overaligned -define amdgpu_kernel void @module_0_kernel_normal_indirect_extern_normal(i32 %idx) #1 { +define amdgpu_kernel void @module_0_kernel_normal_indirect_extern_normal(i32 %idx) { ; CHECK-LABEL: module_0_kernel_normal_indirect_extern_normal: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_add_u32 s8, s8, s11 @@ -396,7 +396,7 @@ ret void } -define amdgpu_kernel void @module_0_kernel_overalign_indirect_extern_normal(i32 %idx) #1 { +define amdgpu_kernel void @module_0_kernel_overalign_indirect_extern_normal(i32 %idx) { ; CHECK-LABEL: module_0_kernel_overalign_indirect_extern_normal: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_add_u32 s8, s8, s11 @@ -463,7 +463,7 @@ ret void } -define amdgpu_kernel void @module_0_kernel_normal_indirect_extern_overalign(i32 %idx) #1 { +define amdgpu_kernel void @module_0_kernel_normal_indirect_extern_overalign(i32 %idx) { ; CHECK-LABEL: module_0_kernel_normal_indirect_extern_overalign: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_add_u32 s8, s8, s11 @@ -530,7 +530,7 @@ ret void } -define amdgpu_kernel void @module_0_kernel_overalign_indirect_extern_overalign(i32 %idx) #1 { +define amdgpu_kernel void @module_0_kernel_overalign_indirect_extern_overalign(i32 %idx) { ; CHECK-LABEL: module_0_kernel_overalign_indirect_extern_overalign: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_add_u32 s8, s8, s11 @@ -599,4 +599,3 @@ attributes #0 = { noinline } -attributes #1 = { "amdgpu-elide-module-lds" } diff --git a/llvm/test/CodeGen/AMDGPU/lower-kernel-and-module-lds.ll b/llvm/test/CodeGen/AMDGPU/lower-kernel-and-module-lds.ll --- a/llvm/test/CodeGen/AMDGPU/lower-kernel-and-module-lds.ll +++ b/llvm/test/CodeGen/AMDGPU/lower-kernel-and-module-lds.ll @@ -22,7 +22,7 @@ ; CHECK: @llvm.amdgcn.kernel.k3.lds = internal addrspace(3) global %llvm.amdgcn.kernel.k3.lds.t undef, align 4, !absolute_symbol !0 ;. define amdgpu_kernel void @k0() #0 { -; CHECK-LABEL: @k0( +; CHECK-LABEL: @k0() #0 ; CHECK-NEXT: store i8 1, ptr addrspace(3) getelementptr inbounds (%llvm.amdgcn.kernel.k0.lds.t, ptr addrspace(3) @llvm.amdgcn.kernel.k0.lds, i32 0, i32 3), align 2, !alias.scope !1, !noalias !4 ; CHECK-NEXT: store i8 2, ptr addrspace(3) getelementptr inbounds (%llvm.amdgcn.kernel.k0.lds.t, ptr addrspace(3) @llvm.amdgcn.kernel.k0.lds, i32 0, i32 2), align 4, !alias.scope !8, !noalias !9 ; CHECK-NEXT: store i8 4, ptr addrspace(3) getelementptr inbounds (%llvm.amdgcn.kernel.k0.lds.t, ptr addrspace(3) @llvm.amdgcn.kernel.k0.lds, i32 0, i32 1), align 16, !alias.scope !10, !noalias !11 @@ -40,7 +40,7 @@ } define amdgpu_kernel void @k1() #0 { -; CHECK-LABEL: @k1( +; CHECK-LABEL: @k1() #1 ; CHECK-NEXT: store i8 2, ptr addrspace(3) getelementptr inbounds (%llvm.amdgcn.kernel.k1.lds.t, ptr addrspace(3) @llvm.amdgcn.kernel.k1.lds, i32 0, i32 2), align 4, !alias.scope !14, !noalias !17 ; CHECK-NEXT: store i8 4, ptr addrspace(3) getelementptr inbounds (%llvm.amdgcn.kernel.k1.lds.t, ptr addrspace(3) @llvm.amdgcn.kernel.k1.lds, i32 0, i32 1), align 16, !alias.scope !20, !noalias !21 ; CHECK-NEXT: store i8 16, ptr addrspace(3) @llvm.amdgcn.kernel.k1.lds, align 16, !alias.scope !22, !noalias !23 @@ -56,7 +56,7 @@ } define amdgpu_kernel void @k2() #0 { -; CHECK-LABEL: @k2( +; CHECK-LABEL: @k2() #2 ; CHECK-NEXT: store i8 2, ptr addrspace(3) @llvm.amdgcn.kernel.k2.lds, align 2 ; CHECK-NEXT: ret void ; @@ -66,7 +66,7 @@ } define amdgpu_kernel void @k3() #0 { -; CHECK-LABEL: @k3( +; CHECK-LABEL: @k3() #3 ; CHECK-NEXT: store i8 4, ptr addrspace(3) @llvm.amdgcn.kernel.k3.lds, align 4 ; CHECK-NEXT: ret void ; @@ -75,14 +75,14 @@ ret void } - +; CHECK-LABEL: @calls_f0() #4 define amdgpu_kernel void @calls_f0() { call void @f0() ret void } define void @f0() { -; CHECK-LABEL: define void @f0( +; CHECK-LABEL: define void @f0() ; CHECK-NEXT: store i8 1, ptr addrspace(3) getelementptr inbounds (%llvm.amdgcn.module.lds.t, ptr addrspace(3) @llvm.amdgcn.module.lds, i32 0, i32 1), align 8, !noalias !24 ; CHECK-NEXT: store i8 8, ptr addrspace(3) @llvm.amdgcn.module.lds, align 8, !noalias !24 ; CHECK-NEXT: ret void @@ -93,7 +93,10 @@ ret void } -attributes #0 = { "amdgpu-elide-module-lds" } -; CHECK: attributes #0 = { "amdgpu-elide-module-lds" } +; CHECK: attributes #0 = { "amdgpu-lds-size"="23" } +; CHECK: attributes #1 = { "amdgpu-lds-size"="22" } +; CHECK: attributes #2 = { "amdgpu-lds-size"="2" } +; CHECK: attributes #3 = { "amdgpu-lds-size"="4" } +; CHECK: attributes #4 = { "amdgpu-lds-size"="9" } ; CHECK: !0 = !{i64 0, i64 1} diff --git a/llvm/test/CodeGen/AMDGPU/lower-module-lds-constantexpr.ll b/llvm/test/CodeGen/AMDGPU/lower-module-lds-constantexpr.ll --- a/llvm/test/CodeGen/AMDGPU/lower-module-lds-constantexpr.ll +++ b/llvm/test/CodeGen/AMDGPU/lower-module-lds-constantexpr.ll @@ -42,7 +42,7 @@ ; CHECK: %5 = inttoptr i64 %4 to ptr ; CHECK: store i32 %x, ptr %5, align 4 ; CHECK: ret void -define void @set_func(i32 %x) local_unnamed_addr #1 { +define void @set_func(i32 %x) { entry: store i32 %x, ptr inttoptr (i64 add (i64 ptrtoint (ptr addrspacecast (ptr addrspace(3) @b_both to ptr) to i64), i64 ptrtoint (ptr addrspacecast (ptr addrspace(3) @b_both to ptr) to i64)) to ptr), align 4 ret void @@ -74,7 +74,7 @@ ret void } -; CHECK-LABEL: @through_functions() +; CHECK-LABEL: @through_functions() #0 define amdgpu_kernel void @through_functions() { %ld = call i32 @get_func() %mul = mul i32 %ld, 4 @@ -82,5 +82,4 @@ ret void } -attributes #0 = { "amdgpu-elide-module-lds" } -; CHECK: attributes #0 = { "amdgpu-elide-module-lds" } +; CHECK: attributes #0 = { "amdgpu-lds-size"="8" } diff --git a/llvm/test/CodeGen/AMDGPU/lower-module-lds-indirect-extern-uses-max-reachable-alignment.ll b/llvm/test/CodeGen/AMDGPU/lower-module-lds-indirect-extern-uses-max-reachable-alignment.ll --- a/llvm/test/CodeGen/AMDGPU/lower-module-lds-indirect-extern-uses-max-reachable-alignment.ll +++ b/llvm/test/CodeGen/AMDGPU/lower-module-lds-indirect-extern-uses-max-reachable-alignment.ll @@ -34,7 +34,7 @@ define amdgpu_kernel void @kernel_only() { -; CHECK-LABEL: @kernel_only() #0 { +; CHECK-LABEL: @kernel_only() { ; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [0 x double], ptr addrspace(3) @dynamic_kernel_only, i32 0, i32 0 ; CHECK-NEXT: store double 3.140000e+00, ptr addrspace(3) [[ARRAYIDX]], align 8 ; CHECK-NEXT: ret void @@ -45,8 +45,8 @@ } ; The accesses from functions are rewritten to go through the llvm.amdgcn.dynlds.offset.table -define void @use_shared1() #0 { -; CHECK-LABEL: @use_shared1() #1 { +define void @use_shared1() { +; CHECK-LABEL: @use_shared1() { ; CHECK-NEXT: [[TMP1:%.*]] = call i32 @llvm.amdgcn.lds.kernel.id() ; CHECK-NEXT: [[DYNAMIC_SHARED1:%.*]] = getelementptr inbounds [5 x i32], ptr addrspace(4) @llvm.amdgcn.dynlds.offset.table, i32 0, i32 [[TMP1]] ; CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr addrspace(4) [[DYNAMIC_SHARED1]], align 4 @@ -61,7 +61,7 @@ } define void @use_shared2() #0 { -; CHECK-LABEL: @use_shared2() #1 { +; CHECK-LABEL: @use_shared2() #0 { ; CHECK-NEXT: [[TMP1:%.*]] = call i32 @llvm.amdgcn.lds.kernel.id() ; CHECK-NEXT: [[DYNAMIC_SHARED2:%.*]] = getelementptr inbounds [5 x i32], ptr addrspace(4) @llvm.amdgcn.dynlds.offset.table, i32 0, i32 [[TMP1]] ; CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr addrspace(4) [[DYNAMIC_SHARED2]], align 4 @@ -78,7 +78,7 @@ ; Include a normal variable so that the new variables aren't all at the same absolute_symbol @static_shared = addrspace(3) global i32 undef define void @use_shared4() #0 { -; CHECK-LABEL: @use_shared4() #1 { +; CHECK-LABEL: @use_shared4() #0 { ; CHECK-NEXT: [[TMP1:%.*]] = call i32 @llvm.amdgcn.lds.kernel.id() ; CHECK-NEXT: store i32 4, ptr addrspace(3) @llvm.amdgcn.module.lds, align 4 ; CHECK-NEXT: [[DYNAMIC_SHARED4:%.*]] = getelementptr inbounds [5 x i32], ptr addrspace(4) @llvm.amdgcn.dynlds.offset.table, i32 0, i32 [[TMP1]] @@ -95,7 +95,7 @@ } define void @use_shared8() #0 { -; CHECK-LABEL: @use_shared8() #1 { +; CHECK-LABEL: @use_shared8() #0 { ; CHECK-NEXT: [[TMP1:%.*]] = call i32 @llvm.amdgcn.lds.kernel.id() ; CHECK-NEXT: [[DYNAMIC_SHARED8:%.*]] = getelementptr inbounds [5 x i32], ptr addrspace(4) @llvm.amdgcn.dynlds.offset.table, i32 0, i32 [[TMP1]] ; CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr addrspace(4) [[DYNAMIC_SHARED8]], align 4 @@ -111,7 +111,7 @@ ; The kernels are annotated with kernel.id and llvm.donothing use of the corresponding variable define amdgpu_kernel void @expect_align1() { -; CHECK-LABEL: @expect_align1() #0 !llvm.amdgcn.lds.kernel.id !2 +; CHECK-LABEL: @expect_align1() !llvm.amdgcn.lds.kernel.id !2 ; CHECK-NEXT: call void @llvm.donothing() [ "ExplicitUse"(ptr addrspace(3) @llvm.amdgcn.expect_align1.dynlds) ] ; CHECK-NEXT: call void @use_shared1() ; CHECK-NEXT: ret void @@ -121,7 +121,7 @@ } define amdgpu_kernel void @expect_align2() { -; CHECK-LABEL: @expect_align2() #0 !llvm.amdgcn.lds.kernel.id !3 +; CHECK-LABEL: @expect_align2() !llvm.amdgcn.lds.kernel.id !3 ; CHECK-NEXT: call void @llvm.donothing() [ "ExplicitUse"(ptr addrspace(3) @llvm.amdgcn.expect_align2.dynlds) ] ; CHECK-NEXT: call void @use_shared2() ; CHECK-NEXT: ret void @@ -131,7 +131,7 @@ } define amdgpu_kernel void @expect_align4() { -; CHECK-LABEL: @expect_align4() !llvm.amdgcn.lds.kernel.id !4 { +; CHECK-LABEL: @expect_align4() #1 !llvm.amdgcn.lds.kernel.id !4 { ; CHECK-NEXT: call void @llvm.donothing() [ "ExplicitUse"(ptr addrspace(3) @llvm.amdgcn.expect_align4.dynlds) ] ; CHECK-NEXT: call void @llvm.donothing() [ "ExplicitUse"(ptr addrspace(3) @llvm.amdgcn.module.lds) ] ; CHECK-NEXT: call void @use_shared4() @@ -141,9 +141,9 @@ ret void } -; Use dynamic_shared directly too. Can elide module lds (#0) +; Use dynamic_shared directly too. define amdgpu_kernel void @expect_align8() { -; CHECK-LABEL: @expect_align8() #0 !llvm.amdgcn.lds.kernel.id !5 { +; CHECK-LABEL: @expect_align8() !llvm.amdgcn.lds.kernel.id !5 { ; CHECK-NEXT: call void @llvm.donothing() [ "ExplicitUse"(ptr addrspace(3) @llvm.amdgcn.expect_align8.dynlds) ] ; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [0 x i64], ptr addrspace(3) @dynamic_shared8, i32 0, i32 9 ; CHECK-NEXT: store i64 3, ptr addrspace(3) [[ARRAYIDX]], align 4 @@ -158,7 +158,7 @@ ; Note: use_shared4 uses module.lds so this will allocate at offset 4 define amdgpu_kernel void @expect_max_of_2_and_4() { -; CHECK-LABEL: @expect_max_of_2_and_4() !llvm.amdgcn.lds.kernel.id !6 { +; CHECK-LABEL: @expect_max_of_2_and_4() #1 !llvm.amdgcn.lds.kernel.id !6 { ; CHECK-NEXT: call void @llvm.donothing() [ "ExplicitUse"(ptr addrspace(3) @llvm.amdgcn.expect_max_of_2_and_4.dynlds) ] ; CHECK-NEXT: call void @llvm.donothing() [ "ExplicitUse"(ptr addrspace(3) @llvm.amdgcn.module.lds) ] ; CHECK-NEXT: call void @use_shared2() @@ -179,8 +179,8 @@ ; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none) ; CHECK: declare i32 @llvm.amdgcn.lds.kernel.id() #3 -; CHECK: attributes #0 = { "amdgpu-elide-module-lds" } -; CHECK: attributes #1 = { noinline } +; CHECK: attributes #0 = { noinline } +; CHECK: attributes #1 = { "amdgpu-lds-size"="4" } ; CHECK: attributes #2 = { nocallback nofree nosync nounwind willreturn memory(none) } ; CHECK: attributes #3 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) } diff --git a/llvm/test/CodeGen/AMDGPU/lower-module-lds-offsets.ll b/llvm/test/CodeGen/AMDGPU/lower-module-lds-offsets.ll --- a/llvm/test/CodeGen/AMDGPU/lower-module-lds-offsets.ll +++ b/llvm/test/CodeGen/AMDGPU/lower-module-lds-offsets.ll @@ -42,6 +42,3 @@ store i8 3, ptr addrspace(3) @lds.size.1.align.1, align 1 ret void } - -attributes #0 = { "amdgpu-elide-module-lds" } -; CHECK: attributes #0 = { "amdgpu-elide-module-lds" } diff --git a/llvm/test/CodeGen/AMDGPU/lower-module-lds-via-hybrid.ll b/llvm/test/CodeGen/AMDGPU/lower-module-lds-via-hybrid.ll --- a/llvm/test/CodeGen/AMDGPU/lower-module-lds-via-hybrid.ll +++ b/llvm/test/CodeGen/AMDGPU/lower-module-lds-via-hybrid.ll @@ -284,9 +284,11 @@ !2 = !{i32 1} -; OPT: attributes #0 = { "amdgpu-elide-module-lds" } -; OPT: attributes #1 = { nocallback nofree nosync nounwind willreturn memory(none) } -; OPT: attributes #2 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) } +; OPT: attributes #0 = { "amdgpu-lds-size"="8" } +; OPT: attributes #1 = { "amdgpu-lds-size"="12" } +; OPT: attributes #2 = { "amdgpu-lds-size"="20" } +; OPT: attributes #3 = { nocallback nofree nosync nounwind willreturn memory(none) } +; OPT: attributes #4 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) } ; OPT: !0 = !{i64 0, i64 1} ; OPT: !1 = !{i64 4, i64 5} diff --git a/llvm/test/CodeGen/AMDGPU/lower-module-lds.ll b/llvm/test/CodeGen/AMDGPU/lower-module-lds.ll --- a/llvm/test/CodeGen/AMDGPU/lower-module-lds.ll +++ b/llvm/test/CodeGen/AMDGPU/lower-module-lds.ll @@ -40,7 +40,7 @@ } ; This kernel calls a function that uses LDS so needs the block -; CHECK-LABEL: @kern_call() +; CHECK-LABEL: @kern_call() #0 ; CHECK: call void @llvm.donothing() [ "ExplicitUse"(ptr addrspace(3) @llvm.amdgcn.module.lds) ] ; CHECK: call void @func() ; CHECK: %dec = atomicrmw fsub ptr addrspace(3) @llvm.amdgcn.module.lds, float 2.000000e+00 monotonic, align 8 @@ -53,7 +53,7 @@ ; This kernel does alloc the LDS block as it makes no calls ; CHECK-LABEL: @kern_empty() ; CHECK-NOT: call void @llvm.donothing() -define spir_kernel void @kern_empty() #0{ +define spir_kernel void @kern_empty() { ret void } @@ -61,5 +61,4 @@ ; declaration. declare amdgpu_kernel void @kernel_declaration() -attributes #0 = { "amdgpu-elide-module-lds" } -; CHECK: attributes #0 = { "amdgpu-elide-module-lds" } +; CHECK: attributes #0 = { "amdgpu-lds-size"="12" }