diff --git a/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp b/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp --- a/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp @@ -902,8 +902,6 @@ Function *F = I->getFunction(); return F == &Func; }); - - markUsedByKernel(Builder, &Func, ModuleScopeReplacement.SGV); } } @@ -972,14 +970,6 @@ auto Replacement = createLDSVariableReplacement(M, VarName, KernelUsedVariables); - // If any indirect uses, create a direct use to ensure allocation - // TODO: Simpler to unconditionally mark used but that regresses - // codegen in test/CodeGen/AMDGPU/noclobber-barrier.ll - auto Accesses = LDSUsesInfo.indirect_access.find(&Func); - if ((Accesses != LDSUsesInfo.indirect_access.end()) && - !Accesses->second.empty()) - markUsedByKernel(Builder, &Func, Replacement.SGV); - // remove preserves existing codegen removeLocalVarsFromUsedLists(M, KernelUsedVariables); KernelToReplacement[&Func] = Replacement; @@ -1067,6 +1057,7 @@ KernelToCreatedDynamicLDS[func] = N; + // Could replace this with a dynamic LDS alignment attribute markUsedByKernel(Builder, func, N); auto emptyCharArray = ArrayType::get(Type::getInt8Ty(Ctx), 0); diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp b/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp --- a/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp @@ -945,22 +945,28 @@ AllocatedSizes.emplace_back(AllocSize, Alignment); } - // Sort to try to estimate the worst case alignment padding - // - // FIXME: We should really do something to fix the addresses to a more optimal - // value instead - llvm::sort(AllocatedSizes, llvm::less_second()); - // Check how much local memory is being used by global objects CurrentLocalMemUsage = 0; - // FIXME: Try to account for padding here. The real padding and address is - // currently determined from the inverse order of uses in the function when - // legalizing, which could also potentially change. We try to estimate the - // worst case here, but we probably should fix the addresses earlier. - for (auto Alloc : AllocatedSizes) { - CurrentLocalMemUsage = alignTo(CurrentLocalMemUsage, Alloc.second); - CurrentLocalMemUsage += Alloc.first; + // If the kernel has an amdgpu-lds-size attribute, use that value instead of + // estimating. + CurrentLocalMemUsage = F.getFnAttributeAsParsedInteger("amdgpu-lds-size", 0); + + if (CurrentLocalMemUsage == 0) { + // Sort to try to estimate the worst case alignment padding + // + // FIXME: We should really do something to fix the addresses to a more optimal + // value instead + llvm::sort(AllocatedSizes, llvm::less_second()); + + // FIXME: Try to account for padding here. The real padding and address is + // currently determined from the inverse order of uses in the function when + // legalizing, which could also potentially change. We try to estimate the + // worst case here, but we probably should fix the addresses earlier. + for (auto Alloc : AllocatedSizes) { + CurrentLocalMemUsage = alignTo(CurrentLocalMemUsage, Alloc.second); + CurrentLocalMemUsage += Alloc.first; + } } unsigned MaxOccupancy = diff --git a/llvm/test/CodeGen/AMDGPU/lower-module-lds-all-indirect-accesses.ll b/llvm/test/CodeGen/AMDGPU/lower-module-lds-all-indirect-accesses.ll --- a/llvm/test/CodeGen/AMDGPU/lower-module-lds-all-indirect-accesses.ll +++ b/llvm/test/CodeGen/AMDGPU/lower-module-lds-all-indirect-accesses.ll @@ -10,7 +10,6 @@ define amdgpu_kernel void @kernel_0() { ; CHECK-LABEL: define amdgpu_kernel void @kernel_0() #0 !llvm.amdgcn.lds.kernel.id !1 { -; CHECK-NEXT: call void @llvm.donothing() [ "ExplicitUse"(ptr addrspace(3) @llvm.amdgcn.kernel.kernel_0.lds) ] ; CHECK-NEXT: call void @call_store_A() ; CHECK-NEXT: ret void ; @@ -30,7 +29,6 @@ define amdgpu_kernel void @kernel_2() { ; CHECK-LABEL: define amdgpu_kernel void @kernel_2() #0 !llvm.amdgcn.lds.kernel.id !3 { -; CHECK-NEXT: call void @llvm.donothing() [ "ExplicitUse"(ptr addrspace(3) @llvm.amdgcn.kernel.kernel_2.lds) ] ; CHECK-NEXT: call void @store_A() ; CHECK-NEXT: ret void ; diff --git a/llvm/test/CodeGen/AMDGPU/lower-module-lds-indirect-extern-uses-max-reachable-alignment.ll b/llvm/test/CodeGen/AMDGPU/lower-module-lds-indirect-extern-uses-max-reachable-alignment.ll --- a/llvm/test/CodeGen/AMDGPU/lower-module-lds-indirect-extern-uses-max-reachable-alignment.ll +++ b/llvm/test/CodeGen/AMDGPU/lower-module-lds-indirect-extern-uses-max-reachable-alignment.ll @@ -33,7 +33,7 @@ define amdgpu_kernel void @kernel_only() { -; CHECK-LABEL: @kernel_only() { +; CHECK-LABEL: @kernel_only( ; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [0 x double], ptr addrspace(3) @dynamic_kernel_only, i32 0, i32 0 ; CHECK-NEXT: store double 3.140000e+00, ptr addrspace(3) [[ARRAYIDX]], align 8 ; CHECK-NEXT: ret void @@ -44,8 +44,8 @@ } ; The accesses from functions are rewritten to go through the llvm.amdgcn.dynlds.offset.table -define void @use_shared1() { -; CHECK-LABEL: @use_shared1() { +define void @use_shared1() #0 { +; CHECK-LABEL: @use_shared1() #0 ; CHECK-NEXT: [[TMP1:%.*]] = call i32 @llvm.amdgcn.lds.kernel.id() ; CHECK-NEXT: [[DYNAMIC_SHARED1:%.*]] = getelementptr inbounds [5 x i32], ptr addrspace(4) @llvm.amdgcn.dynlds.offset.table, i32 0, i32 [[TMP1]] ; CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr addrspace(4) [[DYNAMIC_SHARED1]], align 4 @@ -60,7 +60,7 @@ } define void @use_shared2() #0 { -; CHECK-LABEL: @use_shared2() #0 { +; CHECK-LABEL: @use_shared2() #0 ; CHECK-NEXT: [[TMP1:%.*]] = call i32 @llvm.amdgcn.lds.kernel.id() ; CHECK-NEXT: [[DYNAMIC_SHARED2:%.*]] = getelementptr inbounds [5 x i32], ptr addrspace(4) @llvm.amdgcn.dynlds.offset.table, i32 0, i32 [[TMP1]] ; CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr addrspace(4) [[DYNAMIC_SHARED2]], align 4 @@ -77,7 +77,7 @@ ; Include a normal variable so that the new variables aren't all at the same absolute_symbol @static_shared = addrspace(3) global i32 undef define void @use_shared4() #0 { -; CHECK-LABEL: @use_shared4() #0 { +; CHECK-LABEL: @use_shared4() #0 ; CHECK-NEXT: [[TMP1:%.*]] = call i32 @llvm.amdgcn.lds.kernel.id() ; CHECK-NEXT: store i32 4, ptr addrspace(3) @llvm.amdgcn.module.lds, align 4 ; CHECK-NEXT: [[DYNAMIC_SHARED4:%.*]] = getelementptr inbounds [5 x i32], ptr addrspace(4) @llvm.amdgcn.dynlds.offset.table, i32 0, i32 [[TMP1]] @@ -94,7 +94,7 @@ } define void @use_shared8() #0 { -; CHECK-LABEL: @use_shared8() #0 { +; CHECK-LABEL: @use_shared8() #0 ; CHECK-NEXT: [[TMP1:%.*]] = call i32 @llvm.amdgcn.lds.kernel.id() ; CHECK-NEXT: [[DYNAMIC_SHARED8:%.*]] = getelementptr inbounds [5 x i32], ptr addrspace(4) @llvm.amdgcn.dynlds.offset.table, i32 0, i32 [[TMP1]] ; CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr addrspace(4) [[DYNAMIC_SHARED8]], align 4 @@ -130,9 +130,8 @@ } define amdgpu_kernel void @expect_align4() { -; CHECK-LABEL: @expect_align4() #1 !llvm.amdgcn.lds.kernel.id !4 { +; CHECK-LABEL: @expect_align4() #1 !llvm.amdgcn.lds.kernel.id !4 ; CHECK-NEXT: call void @llvm.donothing() [ "ExplicitUse"(ptr addrspace(3) @llvm.amdgcn.expect_align4.dynlds) ] -; CHECK-NEXT: call void @llvm.donothing() [ "ExplicitUse"(ptr addrspace(3) @llvm.amdgcn.module.lds) ] ; CHECK-NEXT: call void @use_shared4() ; CHECK-NEXT: ret void ; @@ -140,9 +139,9 @@ ret void } -; Use dynamic_shared directly too. +; Use dynamic_shared directly too. Can elide module lds (#0) define amdgpu_kernel void @expect_align8() { -; CHECK-LABEL: @expect_align8() !llvm.amdgcn.lds.kernel.id !5 { +; CHECK-LABEL: @expect_align8() !llvm.amdgcn.lds.kernel.id !5 ; CHECK-NEXT: call void @llvm.donothing() [ "ExplicitUse"(ptr addrspace(3) @llvm.amdgcn.expect_align8.dynlds) ] ; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [0 x i64], ptr addrspace(3) @dynamic_shared8, i32 0, i32 9 ; CHECK-NEXT: store i64 3, ptr addrspace(3) [[ARRAYIDX]], align 4 @@ -157,9 +156,8 @@ ; Note: use_shared4 uses module.lds so this will allocate at offset 4 define amdgpu_kernel void @expect_max_of_2_and_4() { -; CHECK-LABEL: @expect_max_of_2_and_4() #1 !llvm.amdgcn.lds.kernel.id !6 { +; CHECK-LABEL: @expect_max_of_2_and_4() #1 !llvm.amdgcn.lds.kernel.id !6 ; CHECK-NEXT: call void @llvm.donothing() [ "ExplicitUse"(ptr addrspace(3) @llvm.amdgcn.expect_max_of_2_and_4.dynlds) ] -; CHECK-NEXT: call void @llvm.donothing() [ "ExplicitUse"(ptr addrspace(3) @llvm.amdgcn.module.lds) ] ; CHECK-NEXT: call void @use_shared2() ; CHECK-NEXT: call void @use_shared4() ; CHECK-NEXT: ret void @@ -181,7 +179,7 @@ ; CHECK: attributes #0 = { noinline } ; CHECK: attributes #1 = { "amdgpu-lds-size"="4" } ; CHECK: attributes #2 = { nocallback nofree nosync nounwind willreturn memory(none) } -; CHECK: attributes #3 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) } +; CHECK: attributes #3 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) } ; CHECK: !0 = !{i64 0, i64 1} ; CHECK: !1 = !{i64 4, i64 5} diff --git a/llvm/test/CodeGen/AMDGPU/lower-module-lds-offsets.ll b/llvm/test/CodeGen/AMDGPU/lower-module-lds-offsets.ll --- a/llvm/test/CodeGen/AMDGPU/lower-module-lds-offsets.ll +++ b/llvm/test/CodeGen/AMDGPU/lower-module-lds-offsets.ll @@ -17,8 +17,6 @@ ; GCN: ds_write_b8 [[NULL]], [[TWO]] offset:16 define amdgpu_kernel void @k0() { ; OPT-LABEL: @k0( -; OPT-NEXT: call void @llvm.donothing() [ "ExplicitUse"(ptr addrspace(3) @llvm.amdgcn.kernel.k0.lds) ] -; OPT-NEXT: call void @llvm.donothing() [ "ExplicitUse"(ptr addrspace(3) @llvm.amdgcn.module.lds) ] ; OPT-NEXT: store i8 1, ptr addrspace(3) @llvm.amdgcn.module.lds, align 1 ; OPT-NEXT: store i8 2, ptr addrspace(3) @llvm.amdgcn.kernel.k0.lds, align 16 ; OPT-NEXT: call void @f0() diff --git a/llvm/test/CodeGen/AMDGPU/lower-module-lds-single-var-ambiguous.ll b/llvm/test/CodeGen/AMDGPU/lower-module-lds-single-var-ambiguous.ll --- a/llvm/test/CodeGen/AMDGPU/lower-module-lds-single-var-ambiguous.ll +++ b/llvm/test/CodeGen/AMDGPU/lower-module-lds-single-var-ambiguous.ll @@ -68,12 +68,10 @@ define amdgpu_kernel void @k0_f0() { ; M_OR_HY-LABEL: @k0_f0( -; M_OR_HY-NEXT: call void @llvm.donothing() [ "ExplicitUse"(ptr addrspace(3) @llvm.amdgcn.module.lds) ] ; M_OR_HY-NEXT: call void @f0() ; M_OR_HY-NEXT: ret void ; ; TABLE-LABEL: @k0_f0( -; TABLE-NEXT: call void @llvm.donothing() [ "ExplicitUse"(ptr addrspace(3) @llvm.amdgcn.kernel.k0_f0.lds) ] ; TABLE-NEXT: call void @f0() ; TABLE-NEXT: ret void ; @@ -83,12 +81,10 @@ define amdgpu_kernel void @k1_f0() { ; M_OR_HY-LABEL: @k1_f0( -; M_OR_HY-NEXT: call void @llvm.donothing() [ "ExplicitUse"(ptr addrspace(3) @llvm.amdgcn.module.lds) ] ; M_OR_HY-NEXT: call void @f0() ; M_OR_HY-NEXT: ret void ; ; TABLE-LABEL: @k1_f0( -; TABLE-NEXT: call void @llvm.donothing() [ "ExplicitUse"(ptr addrspace(3) @llvm.amdgcn.kernel.k1_f0.lds) ] ; TABLE-NEXT: call void @f0() ; TABLE-NEXT: ret void ; diff --git a/llvm/test/CodeGen/AMDGPU/lower-module-lds-single-var-unambiguous.ll b/llvm/test/CodeGen/AMDGPU/lower-module-lds-single-var-unambiguous.ll --- a/llvm/test/CodeGen/AMDGPU/lower-module-lds-single-var-unambiguous.ll +++ b/llvm/test/CodeGen/AMDGPU/lower-module-lds-single-var-unambiguous.ll @@ -59,20 +59,9 @@ } define amdgpu_kernel void @k_f0() { -; MODULE-LABEL: @k_f0( -; MODULE-NEXT: call void @llvm.donothing() [ "ExplicitUse"(ptr addrspace(3) @llvm.amdgcn.module.lds) ], !alias.scope [[META5:![0-9]+]], !noalias [[META1]] -; MODULE-NEXT: call void @f0() -; MODULE-NEXT: ret void -; -; TABLE-LABEL: @k_f0( -; TABLE-NEXT: call void @llvm.donothing() [ "ExplicitUse"(ptr addrspace(3) @llvm.amdgcn.kernel.k_f0.lds) ] -; TABLE-NEXT: call void @f0() -; TABLE-NEXT: ret void -; -; K_OR_HY-LABEL: @k_f0( -; K_OR_HY-NEXT: call void @llvm.donothing() [ "ExplicitUse"(ptr addrspace(3) @llvm.amdgcn.kernel.k_f0.lds) ] -; K_OR_HY-NEXT: call void @f0() -; K_OR_HY-NEXT: ret void +; CHECK-LABEL: @k_f0( +; CHECK-NEXT: call void @f0() +; CHECK-NEXT: ret void ; call void @f0() ret void @@ -83,7 +72,7 @@ @both.lds = addrspace(3) global i32 undef define void @f_both() { ; MODULE-LABEL: @f_both( -; MODULE-NEXT: [[LD:%.*]] = load i32, ptr addrspace(3) @llvm.amdgcn.module.lds, align 4, !alias.scope [[META5]], !noalias [[META4]] +; MODULE-NEXT: [[LD:%.*]] = load i32, ptr addrspace(3) @llvm.amdgcn.module.lds, align 4, !alias.scope [[META5:![0-9]+]], !noalias [[META4]] ; MODULE-NEXT: [[MUL:%.*]] = mul i32 [[LD]], 4 ; MODULE-NEXT: store i32 [[MUL]], ptr addrspace(3) @llvm.amdgcn.module.lds, align 4, !alias.scope [[META5]], !noalias [[META4]] ; MODULE-NEXT: ret void @@ -115,7 +104,6 @@ define amdgpu_kernel void @k0_both() { ; MODULE-LABEL: @k0_both( -; MODULE-NEXT: call void @llvm.donothing() [ "ExplicitUse"(ptr addrspace(3) @llvm.amdgcn.module.lds) ] ; MODULE-NEXT: [[LD:%.*]] = load i32, ptr addrspace(3) @llvm.amdgcn.module.lds, align 4, !alias.scope [[META5]], !noalias [[META1]] ; MODULE-NEXT: [[MUL:%.*]] = mul i32 [[LD]], 5 ; MODULE-NEXT: store i32 [[MUL]], ptr addrspace(3) @llvm.amdgcn.module.lds, align 4, !alias.scope [[META5]], !noalias [[META1]] @@ -123,7 +111,6 @@ ; MODULE-NEXT: ret void ; ; TABLE-LABEL: @k0_both( -; TABLE-NEXT: call void @llvm.donothing() [ "ExplicitUse"(ptr addrspace(3) @llvm.amdgcn.kernel.k0_both.lds) ] ; TABLE-NEXT: [[LD:%.*]] = load i32, ptr addrspace(3) @llvm.amdgcn.kernel.k0_both.lds, align 4 ; TABLE-NEXT: [[MUL:%.*]] = mul i32 [[LD]], 5 ; TABLE-NEXT: store i32 [[MUL]], ptr addrspace(3) @llvm.amdgcn.kernel.k0_both.lds, align 4 @@ -131,7 +118,6 @@ ; TABLE-NEXT: ret void ; ; K_OR_HY-LABEL: @k0_both( -; K_OR_HY-NEXT: call void @llvm.donothing() [ "ExplicitUse"(ptr addrspace(3) @llvm.amdgcn.kernel.k0_both.lds) ] ; K_OR_HY-NEXT: [[LD:%.*]] = load i32, ptr addrspace(3) @llvm.amdgcn.kernel.k0_both.lds, align 4 ; K_OR_HY-NEXT: [[MUL:%.*]] = mul i32 [[LD]], 5 ; K_OR_HY-NEXT: store i32 [[MUL]], ptr addrspace(3) @llvm.amdgcn.kernel.k0_both.lds, align 4 diff --git a/llvm/test/CodeGen/AMDGPU/lower-module-lds-via-hybrid.ll b/llvm/test/CodeGen/AMDGPU/lower-module-lds-via-hybrid.ll --- a/llvm/test/CodeGen/AMDGPU/lower-module-lds-via-hybrid.ll +++ b/llvm/test/CodeGen/AMDGPU/lower-module-lds-via-hybrid.ll @@ -1,4 +1,4 @@ -; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py ; RUN: opt -S -mtriple=amdgcn--amdhsa -passes=amdgpu-lower-module-lds < %s --amdgpu-lower-module-lds-strategy=hybrid | FileCheck -check-prefix=OPT %s ; RUN: llc -mtriple=amdgcn--amdhsa -verify-machineinstrs < %s --amdgpu-lower-module-lds-strategy=hybrid | FileCheck -check-prefix=GCN %s @@ -158,8 +158,6 @@ ; Access two variables, will allocate those two define amdgpu_kernel void @k01() { ; OPT-LABEL: @k01( -; OPT-NEXT: call void @llvm.donothing() [ "ExplicitUse"(ptr addrspace(3) @llvm.amdgcn.kernel.k01.lds) ] -; OPT-NEXT: call void @llvm.donothing() [ "ExplicitUse"(ptr addrspace(3) @llvm.amdgcn.module.lds) ] ; OPT-NEXT: call void @f0() ; OPT-NEXT: call void @f1() ; OPT-NEXT: ret void @@ -193,7 +191,6 @@ define amdgpu_kernel void @k23() { ; OPT-LABEL: @k23( -; OPT-NEXT: call void @llvm.donothing() [ "ExplicitUse"(ptr addrspace(3) @llvm.amdgcn.kernel.k23.lds) ], !alias.scope [[META4:![0-9]+]], !noalias [[META7:![0-9]+]] ; OPT-NEXT: call void @f2() ; OPT-NEXT: call void @f3() ; OPT-NEXT: ret void @@ -231,12 +228,10 @@ ; Access and allocate three variables define amdgpu_kernel void @k123() { ; OPT-LABEL: @k123( -; OPT-NEXT: call void @llvm.donothing() [ "ExplicitUse"(ptr addrspace(3) @llvm.amdgcn.kernel.k123.lds) ], !alias.scope [[META10:![0-9]+]], !noalias [[META13:![0-9]+]] -; OPT-NEXT: call void @llvm.donothing() [ "ExplicitUse"(ptr addrspace(3) @llvm.amdgcn.module.lds) ] ; OPT-NEXT: call void @f1() -; OPT-NEXT: [[LD:%.*]] = load i8, ptr addrspace(3) getelementptr inbounds ([[LLVM_AMDGCN_KERNEL_K123_LDS_T:%.*]], ptr addrspace(3) @llvm.amdgcn.kernel.k123.lds, i32 0, i32 1), align 8, !alias.scope [[META13]], !noalias [[META10]] +; OPT-NEXT: [[LD:%.*]] = load i8, ptr addrspace(3) getelementptr inbounds ([[LLVM_AMDGCN_KERNEL_K123_LDS_T:%.*]], ptr addrspace(3) @llvm.amdgcn.kernel.k123.lds, i32 0, i32 1), align 8, !alias.scope [[META5:![0-9]+]], !noalias [[META8:![0-9]+]] ; OPT-NEXT: [[MUL:%.*]] = mul i8 [[LD]], 8 -; OPT-NEXT: store i8 [[MUL]], ptr addrspace(3) getelementptr inbounds ([[LLVM_AMDGCN_KERNEL_K123_LDS_T]], ptr addrspace(3) @llvm.amdgcn.kernel.k123.lds, i32 0, i32 1), align 8, !alias.scope [[META13]], !noalias [[META10]] +; OPT-NEXT: store i8 [[MUL]], ptr addrspace(3) getelementptr inbounds ([[LLVM_AMDGCN_KERNEL_K123_LDS_T]], ptr addrspace(3) @llvm.amdgcn.kernel.k123.lds, i32 0, i32 1), align 8, !alias.scope [[META5]], !noalias [[META8]] ; OPT-NEXT: call void @f2() ; OPT-NEXT: ret void ; @@ -287,24 +282,18 @@ ; OPT: attributes #0 = { "amdgpu-lds-size"="8" } ; OPT: attributes #1 = { "amdgpu-lds-size"="12" } ; OPT: attributes #2 = { "amdgpu-lds-size"="20" } -; OPT: attributes #3 = { nocallback nofree nosync nounwind willreturn memory(none) } -; OPT: attributes #4 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) } +; OPT: attributes #3 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) } ; OPT: !0 = !{i64 0, i64 1} ; OPT: !1 = !{i64 4, i64 5} ; OPT: !2 = !{i64 8, i64 9} ; OPT: !3 = !{i32 1} -; OPT: !4 = !{!5} -; OPT: !5 = distinct !{!5, !6} -; OPT: !6 = distinct !{!6} -; OPT: !7 = !{!8} -; OPT: !8 = distinct !{!8, !6} -; OPT: !9 = !{i32 0} -; OPT: !10 = !{!11} -; OPT: !11 = distinct !{!11, !12} -; OPT: !12 = distinct !{!12} -; OPT: !13 = !{!14} -; OPT: !14 = distinct !{!14, !12} +; OPT: !4 = !{i32 0} +; OPT: !5 = !{!6} +; OPT: !6 = distinct !{!6, !7} +; OPT: !7 = distinct !{!7} +; OPT: !8 = !{!9} +; OPT: !9 = distinct !{!9, !7} ; Table size length number-kernels * number-variables * sizeof(uint16_t) ; GCN: .type llvm.amdgcn.lds.offset.table,@object diff --git a/llvm/test/CodeGen/AMDGPU/lower-module-lds-via-table.ll b/llvm/test/CodeGen/AMDGPU/lower-module-lds-via-table.ll --- a/llvm/test/CodeGen/AMDGPU/lower-module-lds-via-table.ll +++ b/llvm/test/CodeGen/AMDGPU/lower-module-lds-via-table.ll @@ -1,3 +1,4 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py ; RUN: opt -S -mtriple=amdgcn--amdhsa -passes=amdgpu-lower-module-lds < %s --amdgpu-lower-module-lds-strategy=table | FileCheck -check-prefix=OPT %s ; RUN: llc -mtriple=amdgcn--amdhsa -verify-machineinstrs < %s --amdgpu-lower-module-lds-strategy=table | FileCheck -check-prefix=GCN %s @@ -219,7 +220,6 @@ ; Access two variables, will allocate those two define amdgpu_kernel void @k01() { ; OPT-LABEL: @k01() #0 !llvm.amdgcn.lds.kernel.id !1 { -; OPT-NEXT: call void @llvm.donothing() [ "ExplicitUse"(ptr addrspace(3) @llvm.amdgcn.kernel.k01.lds) ] ; OPT-NEXT: call void @f0() ; OPT-NEXT: call void @f1() ; OPT-NEXT: ret void @@ -256,8 +256,7 @@ } define amdgpu_kernel void @k23() { -; OPT-LABEL: @k23() #1 !llvm.amdgcn.lds.kernel.id !7 { -; OPT-NEXT: call void @llvm.donothing() [ "ExplicitUse"(ptr addrspace(3) @llvm.amdgcn.kernel.k23.lds) ] +; OPT-LABEL: @k23() #1 !llvm.amdgcn.lds.kernel.id !2 { ; OPT-NEXT: call void @f2() ; OPT-NEXT: call void @f3() ; OPT-NEXT: ret void @@ -295,12 +294,11 @@ ; Access and allocate three variables define amdgpu_kernel void @k123() { -; OPT-LABEL: @k123() #2 !llvm.amdgcn.lds.kernel.id !13 { -; OPT-NEXT: call void @llvm.donothing() [ "ExplicitUse"(ptr addrspace(3) @llvm.amdgcn.kernel.k123.lds) ] +; OPT-LABEL: @k123() #2 !llvm.amdgcn.lds.kernel.id !3 { ; OPT-NEXT: call void @f1() -; OPT-NEXT: [[LD:%.*]] = load i8, ptr addrspace(3) getelementptr inbounds ([[LLVM_AMDGCN_KERNEL_K123_LDS_T:%.*]], ptr addrspace(3) @llvm.amdgcn.kernel.k123.lds, i32 0, i32 1), align 2, !alias.scope !20, !noalias !21 +; OPT-NEXT: [[LD:%.*]] = load i8, ptr addrspace(3) getelementptr inbounds ([[LLVM_AMDGCN_KERNEL_K123_LDS_T:%.*]], ptr addrspace(3) @llvm.amdgcn.kernel.k123.lds, i32 0, i32 1), align 2, !alias.scope [[META4:![0-9]+]], !noalias [[META7:![0-9]+]] ; OPT-NEXT: [[MUL:%.*]] = mul i8 [[LD]], 8 -; OPT-NEXT: store i8 [[MUL]], ptr addrspace(3) getelementptr inbounds ([[LLVM_AMDGCN_KERNEL_K123_LDS_T]], ptr addrspace(3) @llvm.amdgcn.kernel.k123.lds, i32 0, i32 1), align 2, !alias.scope !20, !noalias !21 +; OPT-NEXT: store i8 [[MUL]], ptr addrspace(3) getelementptr inbounds ([[LLVM_AMDGCN_KERNEL_K123_LDS_T]], ptr addrspace(3) @llvm.amdgcn.kernel.k123.lds, i32 0, i32 1), align 2, !alias.scope [[META4]], !noalias [[META7]] ; OPT-NEXT: call void @f2() ; OPT-NEXT: ret void ; diff --git a/llvm/test/CodeGen/AMDGPU/lower-module-lds.ll b/llvm/test/CodeGen/AMDGPU/lower-module-lds.ll --- a/llvm/test/CodeGen/AMDGPU/lower-module-lds.ll +++ b/llvm/test/CodeGen/AMDGPU/lower-module-lds.ll @@ -41,7 +41,6 @@ ; This kernel calls a function that uses LDS so needs the block ; CHECK-LABEL: @kern_call() #0 -; CHECK: call void @llvm.donothing() [ "ExplicitUse"(ptr addrspace(3) @llvm.amdgcn.module.lds) ] ; CHECK: call void @func() ; CHECK: %dec = atomicrmw fsub ptr addrspace(3) @llvm.amdgcn.module.lds, float 2.000000e+00 monotonic, align 8 define amdgpu_kernel void @kern_call() { @@ -51,9 +50,8 @@ } ; This kernel does alloc the LDS block as it makes no calls -; CHECK-LABEL: @kern_empty() -; CHECK-NOT: call void @llvm.donothing() -define spir_kernel void @kern_empty() { +; CHECK-LABEL: @kern_empty() #1 +define spir_kernel void @kern_empty() #0{ ret void } @@ -61,4 +59,7 @@ ; declaration. declare amdgpu_kernel void @kernel_declaration() +attributes #0 = { "amdgpu-elide-module-lds" } + ; CHECK: attributes #0 = { "amdgpu-lds-size"="12" } +; CHECK: attributes #1 = { "amdgpu-elide-module-lds" }