diff --git a/llvm/docs/AMDGPUUsage.rst b/llvm/docs/AMDGPUUsage.rst --- a/llvm/docs/AMDGPUUsage.rst +++ b/llvm/docs/AMDGPUUsage.rst @@ -1090,6 +1090,12 @@ kernel argument that holds the completion action pointer. If this attribute is absent, then the amdgpu-no-implicitarg-ptr is also removed. + "amdgpu-lds-size" The number of bytes that will be allocated in the Local Data Store at + address zero. Variables are allocated within this frame using absolute + symbol metadata, primarily by the AMDGPULowerModuleLDS pass. This is an + internal detail of how LDS variables are lowered; language front ends + should not set this. + ======================================= ========================================================== Calling Conventions diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp --- a/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp @@ -512,8 +512,6 @@ const SITargetLowering &TLI = *getTLI(); const DataLayout &DL = F.getParent()->getDataLayout(); - Info->allocateKnownAddressLDSGlobal(F); - SmallVector ArgLocs; CCState CCInfo(F.getCallingConv(), F.isVarArg(), MF, ArgLocs, F.getContext()); @@ -596,8 +594,6 @@ const SIRegisterInfo *TRI = Subtarget.getRegisterInfo(); const DataLayout &DL = F.getParent()->getDataLayout(); - Info->allocateKnownAddressLDSGlobal(F); - SmallVector ArgLocs; CCState CCInfo(CC, F.isVarArg(), MF, ArgLocs, F.getContext()); diff --git a/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp b/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp --- a/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp @@ -1106,6 +1106,8 @@ return KernelToCreatedDynamicLDS; } + // This attribute is no longer used by the backend. TODO: Delete it in favour + // of pass-local state and update the tests to remove the string. 
static bool canElideModuleLDS(const Function &F) { return F.hasFnAttribute("amdgpu-elide-module-lds"); } @@ -1211,7 +1213,6 @@ // All kernel frames have been allocated. Calculate and record the // addresses. - { const DataLayout &DL = M.getDataLayout(); @@ -1220,8 +1221,8 @@ continue; // All three of these are optional. The first variable is allocated at - // zero. They are allocated by allocateKnownAddressLDSGlobal in the - // following order: + // zero. They are allocated by AMDGPUMachineFunction as one block. + // Layout: //{ // module.lds // alignment padding @@ -1250,22 +1251,23 @@ if (AllocateKernelScopeStruct) { GlobalVariable *KernelStruct = Replacement->second.SGV; - Offset = alignTo(Offset, AMDGPU::getAlign(DL, KernelStruct)); - recordLDSAbsoluteAddress(&M, KernelStruct, Offset); - Offset += DL.getTypeAllocSize(KernelStruct->getValueType()); - } + // If there is dynamic allocation, the alignment needed is included in + // the static frame size. There may be no reference to the dynamic + // variable in the kernel itself, so without including it here, that + // alignment padding could be missed. 
if (AllocateDynamicVariable) { GlobalVariable *DynamicVariable = KernelToCreatedDynamicLDS[&Func]; - Offset = alignTo(Offset, AMDGPU::getAlign(DL, DynamicVariable)); - recordLDSAbsoluteAddress(&M, DynamicVariable, Offset); } + + if (Offset != 0) + Func.addFnAttr("amdgpu-lds-size", std::to_string(Offset)); } } diff --git a/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.h b/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.h --- a/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.h @@ -104,8 +104,6 @@ unsigned allocateLDSGlobal(const DataLayout &DL, const GlobalVariable &GV, Align Trailing); - void allocateKnownAddressLDSGlobal(const Function &F); - static std::optional getLDSKernelIdMetadata(const Function &F); static std::optional getLDSAbsoluteAddress(const GlobalValue &GV); diff --git a/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.cpp b/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.cpp --- a/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.cpp @@ -43,6 +43,12 @@ // Assume the attribute allocates before any known GDS globals. StaticGDSSize = GDSSize; + // The two separate variables are only profitable when the LDS module lowering + // pass is disabled. If graphics does not use dynamic LDS, this is never + // profitable. Leaving cleanup for a later change. + LDSSize = F.getFnAttributeAsParsedInteger("amdgpu-lds-size", 0); + StaticLDSSize = LDSSize; + CallingConv::ID CC = F.getCallingConv(); if (CC == CallingConv::AMDGPU_KERNEL || CC == CallingConv::SPIR_KERNEL) ExplicitKernArgSize = ST.getExplicitKernArgSize(F, MaxKernArgAlign); @@ -65,6 +71,42 @@ unsigned Offset; if (GV.getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS) { + + std::optional MaybeAbs = getLDSAbsoluteAddress(GV); + if (MaybeAbs) { + // Absolute address LDS variables that exist prior to the LDS lowering + // pass raise a fatal error in that pass. 
These failure modes are only + // reachable if that lowering pass is disabled or broken. If/when adding + // support for absolute addresses on user specified variables, the + // alignment check moves to the lowering pass and the frame calculation + // needs to take the user variables into consideration. + + uint32_t ObjectStart = *MaybeAbs; + + if (ObjectStart != alignTo(ObjectStart, Alignment)) { + report_fatal_error("Absolute address LDS variable inconsistent with " + "variable alignment"); + } + + if (isModuleEntryFunction()) { + // If this is a module entry function, we can also sanity check against + // the static frame. Strictly it would be better to check against the + // attribute, i.e. that the variable is within the always-allocated + // section, and not within some other non-absolute-address object + // allocated here, but the extra error detection is minimal and we would + // have to pass the Function around or cache the attribute value. + uint32_t ObjectEnd = + ObjectStart + DL.getTypeAllocSize(GV.getValueType()); + if (ObjectEnd > StaticLDSSize) { + report_fatal_error( + "Absolute address LDS variable outside of static frame"); + } + } + + Entry.first->second = ObjectStart; + return ObjectStart; + } + /// TODO: We should sort these to minimize wasted space due to alignment /// padding. Currently the padding is decided by the first encountered use /// during lowering. 
@@ -89,16 +131,6 @@ return Offset; } -static constexpr StringLiteral ModuleLDSName = "llvm.amdgcn.module.lds"; - -static const GlobalVariable *getKernelLDSGlobalFromFunction(const Function &F) { - const Module *M = F.getParent(); - std::string KernelLDSName = "llvm.amdgcn.kernel."; - KernelLDSName += F.getName(); - KernelLDSName += ".lds"; - return M->getNamedGlobal(KernelLDSName); -} - static const GlobalVariable * getKernelDynLDSGlobalFromFunction(const Function &F) { const Module *M = F.getParent(); @@ -108,73 +140,6 @@ return M->getNamedGlobal(KernelDynLDSName); } -// This kernel calls no functions that require the module lds struct -static bool canElideModuleLDS(const Function &F) { - return F.hasFnAttribute("amdgpu-elide-module-lds"); -} - -void AMDGPUMachineFunction::allocateKnownAddressLDSGlobal(const Function &F) { - const Module *M = F.getParent(); - // This function is called before allocating any other LDS so that it can - // reliably put values at known addresses. Consequently, dynamic LDS, if - // present, will not yet have been allocated - - assert(getDynLDSAlign() == Align() && "dynamic LDS not yet allocated"); - - if (isModuleEntryFunction()) { - - // Pointer values start from zero, memory allocated per-kernel-launch - // Variables can be grouped into a module level struct and a struct per - // kernel function by AMDGPULowerModuleLDSPass. If that is done, they - // are allocated at statically computable addresses here. - // - // Address 0 - // { - // llvm.amdgcn.module.lds - // } - // alignment padding - // { - // llvm.amdgcn.kernel.some-name.lds - // } - // other variables, e.g. 
dynamic lds, allocated after this call - - const GlobalVariable *GV = M->getNamedGlobal(ModuleLDSName); - const GlobalVariable *KV = getKernelLDSGlobalFromFunction(F); - const GlobalVariable *Dyn = getKernelDynLDSGlobalFromFunction(F); - - if (GV && !canElideModuleLDS(F)) { - unsigned Offset = allocateLDSGlobal(M->getDataLayout(), *GV, Align()); - std::optional Expect = getLDSAbsoluteAddress(*GV); - if (!Expect || (Offset != *Expect)) { - report_fatal_error("Inconsistent metadata on module LDS variable"); - } - } - - if (KV) { - // The per-kernel offset is deterministic because it is allocated - // before any other non-module LDS variables. - unsigned Offset = allocateLDSGlobal(M->getDataLayout(), *KV, Align()); - std::optional Expect = getLDSAbsoluteAddress(*KV); - if (!Expect || (Offset != *Expect)) { - report_fatal_error("Inconsistent metadata on kernel LDS variable"); - } - } - - if (Dyn) { - // The dynamic LDS is deterministic because the per-kernel one has the - // maximum alignment of any reachable and all remaining LDS variables, - // if this is present, are themselves dynamic LDS and will be allocated - // at the same address. 
- setDynLDSAlign(F, *Dyn); - unsigned Offset = LDSSize; - std::optional Expect = getLDSAbsoluteAddress(*Dyn); - if (!Expect || (Offset != *Expect)) { - report_fatal_error("Inconsistent metadata on dynamic LDS variable"); - } - } - } -} - std::optional AMDGPUMachineFunction::getLDSKernelIdMetadata(const Function &F) { // TODO: Would be more consistent with the abs symbols to use a range diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp --- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -2460,8 +2460,6 @@ return DAG.getEntryNode(); } - Info->allocateKnownAddressLDSGlobal(Fn); - SmallVector Splits; SmallVector ArgLocs; BitVector Skipped(Ins.size()); diff --git a/llvm/test/CodeGen/AMDGPU/lower-kernel-and-module-lds.ll b/llvm/test/CodeGen/AMDGPU/lower-kernel-and-module-lds.ll --- a/llvm/test/CodeGen/AMDGPU/lower-kernel-and-module-lds.ll +++ b/llvm/test/CodeGen/AMDGPU/lower-kernel-and-module-lds.ll @@ -22,7 +22,7 @@ ; CHECK: @llvm.amdgcn.kernel.k3.lds = internal addrspace(3) global %llvm.amdgcn.kernel.k3.lds.t undef, align 4, !absolute_symbol !0 ;. 
define amdgpu_kernel void @k0() #0 { -; CHECK-LABEL: @k0( +; CHECK-LABEL: @k0() #0 ; CHECK-NEXT: store i8 1, ptr addrspace(3) getelementptr inbounds (%llvm.amdgcn.kernel.k0.lds.t, ptr addrspace(3) @llvm.amdgcn.kernel.k0.lds, i32 0, i32 3), align 2, !alias.scope !1, !noalias !4 ; CHECK-NEXT: store i8 2, ptr addrspace(3) getelementptr inbounds (%llvm.amdgcn.kernel.k0.lds.t, ptr addrspace(3) @llvm.amdgcn.kernel.k0.lds, i32 0, i32 2), align 4, !alias.scope !8, !noalias !9 ; CHECK-NEXT: store i8 4, ptr addrspace(3) getelementptr inbounds (%llvm.amdgcn.kernel.k0.lds.t, ptr addrspace(3) @llvm.amdgcn.kernel.k0.lds, i32 0, i32 1), align 16, !alias.scope !10, !noalias !11 @@ -40,7 +40,7 @@ } define amdgpu_kernel void @k1() #0 { -; CHECK-LABEL: @k1( +; CHECK-LABEL: @k1() #1 ; CHECK-NEXT: store i8 2, ptr addrspace(3) getelementptr inbounds (%llvm.amdgcn.kernel.k1.lds.t, ptr addrspace(3) @llvm.amdgcn.kernel.k1.lds, i32 0, i32 2), align 4, !alias.scope !14, !noalias !17 ; CHECK-NEXT: store i8 4, ptr addrspace(3) getelementptr inbounds (%llvm.amdgcn.kernel.k1.lds.t, ptr addrspace(3) @llvm.amdgcn.kernel.k1.lds, i32 0, i32 1), align 16, !alias.scope !20, !noalias !21 ; CHECK-NEXT: store i8 16, ptr addrspace(3) @llvm.amdgcn.kernel.k1.lds, align 16, !alias.scope !22, !noalias !23 @@ -56,7 +56,7 @@ } define amdgpu_kernel void @k2() #0 { -; CHECK-LABEL: @k2( +; CHECK-LABEL: @k2() #2 ; CHECK-NEXT: store i8 2, ptr addrspace(3) @llvm.amdgcn.kernel.k2.lds, align 2 ; CHECK-NEXT: ret void ; @@ -66,7 +66,7 @@ } define amdgpu_kernel void @k3() #0 { -; CHECK-LABEL: @k3( +; CHECK-LABEL: @k3() #3 ; CHECK-NEXT: store i8 4, ptr addrspace(3) @llvm.amdgcn.kernel.k3.lds, align 4 ; CHECK-NEXT: ret void ; @@ -75,14 +75,14 @@ ret void } - +; CHECK-LABEL: @calls_f0() #4 define amdgpu_kernel void @calls_f0() { call void @f0() ret void } define void @f0() { -; CHECK-LABEL: define void @f0( +; CHECK-LABEL: define void @f0() ; CHECK-NEXT: store i8 1, ptr addrspace(3) getelementptr inbounds 
(%llvm.amdgcn.module.lds.t, ptr addrspace(3) @llvm.amdgcn.module.lds, i32 0, i32 1), align 8, !noalias !24 ; CHECK-NEXT: store i8 8, ptr addrspace(3) @llvm.amdgcn.module.lds, align 8, !noalias !24 ; CHECK-NEXT: ret void @@ -93,7 +93,10 @@ ret void } -attributes #0 = { "amdgpu-elide-module-lds" } -; CHECK: attributes #0 = { "amdgpu-elide-module-lds" } +; CHECK: attributes #0 = { "amdgpu-elide-module-lds" "amdgpu-lds-size"="23" } +; CHECK: attributes #1 = { "amdgpu-elide-module-lds" "amdgpu-lds-size"="22" } +; CHECK: attributes #2 = { "amdgpu-elide-module-lds" "amdgpu-lds-size"="2" } +; CHECK: attributes #3 = { "amdgpu-elide-module-lds" "amdgpu-lds-size"="4" } +; CHECK: attributes #4 = { "amdgpu-lds-size"="9" } ; CHECK: !0 = !{i64 0, i64 1} diff --git a/llvm/test/CodeGen/AMDGPU/lower-module-lds-all-indirect-accesses.ll b/llvm/test/CodeGen/AMDGPU/lower-module-lds-all-indirect-accesses.ll --- a/llvm/test/CodeGen/AMDGPU/lower-module-lds-all-indirect-accesses.ll +++ b/llvm/test/CodeGen/AMDGPU/lower-module-lds-all-indirect-accesses.ll @@ -9,7 +9,7 @@ @B = external addrspace(3) global [0 x i32] define amdgpu_kernel void @kernel_0() { -; CHECK-LABEL: define amdgpu_kernel void @kernel_0() !llvm.amdgcn.lds.kernel.id !1 { +; CHECK-LABEL: define amdgpu_kernel void @kernel_0() #0 !llvm.amdgcn.lds.kernel.id !1 { ; CHECK-NEXT: call void @llvm.donothing() [ "ExplicitUse"(ptr addrspace(3) @llvm.amdgcn.kernel.kernel_0.lds) ] ; CHECK-NEXT: call void @call_store_A() ; CHECK-NEXT: ret void @@ -29,7 +29,7 @@ } define amdgpu_kernel void @kernel_2() { -; CHECK-LABEL: define amdgpu_kernel void @kernel_2() !llvm.amdgcn.lds.kernel.id !3 { +; CHECK-LABEL: define amdgpu_kernel void @kernel_2() #0 !llvm.amdgcn.lds.kernel.id !3 { ; CHECK-NEXT: call void @llvm.donothing() [ "ExplicitUse"(ptr addrspace(3) @llvm.amdgcn.kernel.kernel_2.lds) ] ; CHECK-NEXT: call void @store_A() ; CHECK-NEXT: ret void @@ -82,3 +82,5 @@ ; ret ptr addrspacecast (ptr addrspace(3) @B to ptr) } + +; CHECK: attributes #0 = { 
"amdgpu-lds-size"="64" } diff --git a/llvm/test/CodeGen/AMDGPU/lower-module-lds-constantexpr.ll b/llvm/test/CodeGen/AMDGPU/lower-module-lds-constantexpr.ll --- a/llvm/test/CodeGen/AMDGPU/lower-module-lds-constantexpr.ll +++ b/llvm/test/CodeGen/AMDGPU/lower-module-lds-constantexpr.ll @@ -48,7 +48,7 @@ ret void } -; CHECK-LABEL: @timestwo() #0 +; CHECK-LABEL: @timestwo() #1 ; CHECK-NOT: call void @llvm.donothing() ; CHECK: %1 = addrspacecast ptr addrspace(3) @llvm.amdgcn.kernel.timestwo.lds to ptr @@ -67,14 +67,14 @@ ; CHECK: %12 = inttoptr i64 %11 to ptr ; CHECK: store i32 %mul, ptr %12, align 4 ; CHECK: ret void -define amdgpu_kernel void @timestwo() { +define amdgpu_kernel void @timestwo() #1 { %ld = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr addrspacecast (ptr addrspace(3) @b_both to ptr) to i64), i64 ptrtoint (ptr addrspacecast (ptr addrspace(3) @kern to ptr) to i64)) to ptr), align 4 %mul = mul i32 %ld, 2 store i32 %mul, ptr inttoptr (i64 add (i64 ptrtoint (ptr addrspacecast (ptr addrspace(3) @kern to ptr) to i64), i64 ptrtoint (ptr addrspacecast (ptr addrspace(3) @b_both to ptr) to i64)) to ptr), align 4 ret void } -; CHECK-LABEL: @through_functions() +; CHECK-LABEL: @through_functions() #2 define amdgpu_kernel void @through_functions() { %ld = call i32 @get_func() %mul = mul i32 %ld, 4 @@ -84,3 +84,5 @@ attributes #0 = { "amdgpu-elide-module-lds" } ; CHECK: attributes #0 = { "amdgpu-elide-module-lds" } +; CHECK: attributes #1 = { "amdgpu-elide-module-lds" "amdgpu-lds-size"="8" } +; CHECK: attributes #2 = { "amdgpu-lds-size"="8" } diff --git a/llvm/test/CodeGen/AMDGPU/lower-module-lds-indirect-extern-uses-max-reachable-alignment.ll b/llvm/test/CodeGen/AMDGPU/lower-module-lds-indirect-extern-uses-max-reachable-alignment.ll --- a/llvm/test/CodeGen/AMDGPU/lower-module-lds-indirect-extern-uses-max-reachable-alignment.ll +++ b/llvm/test/CodeGen/AMDGPU/lower-module-lds-indirect-extern-uses-max-reachable-alignment.ll @@ -131,7 +131,7 @@ } define amdgpu_kernel 
void @expect_align4() { -; CHECK-LABEL: @expect_align4() !llvm.amdgcn.lds.kernel.id !4 { +; CHECK-LABEL: @expect_align4() #2 !llvm.amdgcn.lds.kernel.id !4 { ; CHECK-NEXT: call void @llvm.donothing() [ "ExplicitUse"(ptr addrspace(3) @llvm.amdgcn.expect_align4.dynlds) ] ; CHECK-NEXT: call void @llvm.donothing() [ "ExplicitUse"(ptr addrspace(3) @llvm.amdgcn.module.lds) ] ; CHECK-NEXT: call void @use_shared4() @@ -158,7 +158,7 @@ ; Note: use_shared4 uses module.lds so this will allocate at offset 4 define amdgpu_kernel void @expect_max_of_2_and_4() { -; CHECK-LABEL: @expect_max_of_2_and_4() !llvm.amdgcn.lds.kernel.id !6 { +; CHECK-LABEL: @expect_max_of_2_and_4() #2 !llvm.amdgcn.lds.kernel.id !6 { ; CHECK-NEXT: call void @llvm.donothing() [ "ExplicitUse"(ptr addrspace(3) @llvm.amdgcn.expect_max_of_2_and_4.dynlds) ] ; CHECK-NEXT: call void @llvm.donothing() [ "ExplicitUse"(ptr addrspace(3) @llvm.amdgcn.module.lds) ] ; CHECK-NEXT: call void @use_shared2() @@ -174,15 +174,16 @@ attributes #0 = { noinline } ; Function Attrs: nocallback nofree nosync nounwind willreturn memory(none) -; CHECK: declare void @llvm.donothing() #2 +; CHECK: declare void @llvm.donothing() #3 ; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none) -; CHECK: declare i32 @llvm.amdgcn.lds.kernel.id() #3 +; CHECK: declare i32 @llvm.amdgcn.lds.kernel.id() #4 ; CHECK: attributes #0 = { "amdgpu-elide-module-lds" } ; CHECK: attributes #1 = { noinline } -; CHECK: attributes #2 = { nocallback nofree nosync nounwind willreturn memory(none) } -; CHECK: attributes #3 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) } +; CHECK: attributes #2 = { "amdgpu-lds-size"="4" } +; CHECK: attributes #3 = { nocallback nofree nosync nounwind willreturn memory(none) } +; CHECK: attributes #4 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) } ; CHECK: !0 = !{i64 0, i64 1} ; CHECK: !1 = !{i64 4, i64 5} diff --git 
a/llvm/test/CodeGen/AMDGPU/lower-module-lds-via-hybrid.ll b/llvm/test/CodeGen/AMDGPU/lower-module-lds-via-hybrid.ll --- a/llvm/test/CodeGen/AMDGPU/lower-module-lds-via-hybrid.ll +++ b/llvm/test/CodeGen/AMDGPU/lower-module-lds-via-hybrid.ll @@ -284,9 +284,12 @@ !2 = !{i32 1} -; OPT: attributes #0 = { "amdgpu-elide-module-lds" } -; OPT: attributes #1 = { nocallback nofree nosync nounwind willreturn memory(none) } -; OPT: attributes #2 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) } +; OPT: attributes #0 = { "amdgpu-elide-module-lds" "amdgpu-lds-size"="8" } +; OPT: attributes #1 = { "amdgpu-lds-size"="8" } +; OPT: attributes #2 = { "amdgpu-elide-module-lds" "amdgpu-lds-size"="12" } +; OPT: attributes #3 = { "amdgpu-lds-size"="20" } +; OPT: attributes #4 = { nocallback nofree nosync nounwind willreturn memory(none) } +; OPT: attributes #5 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) } ; OPT: !0 = !{i64 0, i64 1} ; OPT: !1 = !{i64 4, i64 5} diff --git a/llvm/test/CodeGen/AMDGPU/lower-module-lds-via-table.ll b/llvm/test/CodeGen/AMDGPU/lower-module-lds-via-table.ll --- a/llvm/test/CodeGen/AMDGPU/lower-module-lds-via-table.ll +++ b/llvm/test/CodeGen/AMDGPU/lower-module-lds-via-table.ll @@ -195,7 +195,7 @@ ; Doesn't access any via a function, won't be in the lookup table define amdgpu_kernel void @kernel_no_table() { -; OPT-LABEL: @kernel_no_table() { +; OPT-LABEL: @kernel_no_table() #0 { ; OPT-NEXT: [[LD:%.*]] = load i64, ptr addrspace(3) @llvm.amdgcn.kernel.kernel_no_table.lds, align 8 ; OPT-NEXT: [[MUL:%.*]] = mul i64 [[LD]], 8 ; OPT-NEXT: store i64 [[MUL]], ptr addrspace(3) @llvm.amdgcn.kernel.kernel_no_table.lds, align 8 @@ -218,7 +218,7 @@ ; Access two variables, will allocate those two define amdgpu_kernel void @k01() { -; OPT-LABEL: @k01() !llvm.amdgcn.lds.kernel.id !1 { +; OPT-LABEL: @k01() #0 !llvm.amdgcn.lds.kernel.id !1 { ; OPT-NEXT: call void @llvm.donothing() [ "ExplicitUse"(ptr addrspace(3) 
@llvm.amdgcn.kernel.k01.lds) ] ; OPT-NEXT: call void @f0() ; OPT-NEXT: call void @f1() @@ -256,7 +256,7 @@ } define amdgpu_kernel void @k23() { -; OPT-LABEL: @k23() !llvm.amdgcn.lds.kernel.id !7 { +; OPT-LABEL: @k23() #1 !llvm.amdgcn.lds.kernel.id !7 { ; OPT-NEXT: call void @llvm.donothing() [ "ExplicitUse"(ptr addrspace(3) @llvm.amdgcn.kernel.k23.lds) ] ; OPT-NEXT: call void @f2() ; OPT-NEXT: call void @f3() @@ -295,7 +295,7 @@ ; Access and allocate three variables define amdgpu_kernel void @k123() { -; OPT-LABEL: @k123() !llvm.amdgcn.lds.kernel.id !13 { +; OPT-LABEL: @k123() #2 !llvm.amdgcn.lds.kernel.id !13 { ; OPT-NEXT: call void @llvm.donothing() [ "ExplicitUse"(ptr addrspace(3) @llvm.amdgcn.kernel.k123.lds) ] ; OPT-NEXT: call void @f1() ; OPT-NEXT: [[LD:%.*]] = load i8, ptr addrspace(3) getelementptr inbounds ([[LLVM_AMDGCN_KERNEL_K123_LDS_T:%.*]], ptr addrspace(3) @llvm.amdgcn.kernel.k123.lds, i32 0, i32 1), align 2, !alias.scope !20, !noalias !21 @@ -346,6 +346,10 @@ ; OPT: declare i32 @llvm.amdgcn.lds.kernel.id() +; OPT: attributes #0 = { "amdgpu-lds-size"="8" } +; OPT: attributes #1 = { "amdgpu-lds-size"="12" } +; OPT: attributes #2 = { "amdgpu-lds-size"="16" } + !0 = !{i64 0, i64 1} !1 = !{i32 0} !2 = !{i32 2} diff --git a/llvm/test/CodeGen/AMDGPU/lower-module-lds.ll b/llvm/test/CodeGen/AMDGPU/lower-module-lds.ll --- a/llvm/test/CodeGen/AMDGPU/lower-module-lds.ll +++ b/llvm/test/CodeGen/AMDGPU/lower-module-lds.ll @@ -40,7 +40,7 @@ } ; This kernel calls a function that uses LDS so needs the block -; CHECK-LABEL: @kern_call() +; CHECK-LABEL: @kern_call() #0 ; CHECK: call void @llvm.donothing() [ "ExplicitUse"(ptr addrspace(3) @llvm.amdgcn.module.lds) ] ; CHECK: call void @func() ; CHECK: %dec = atomicrmw fsub ptr addrspace(3) @llvm.amdgcn.module.lds, float 2.000000e+00 monotonic, align 8 @@ -51,7 +51,7 @@ } ; This kernel does alloc the LDS block as it makes no calls -; CHECK-LABEL: @kern_empty() +; CHECK-LABEL: @kern_empty() #1 ; CHECK-NOT: call void 
@llvm.donothing() define spir_kernel void @kern_empty() #0{ ret void @@ -62,4 +62,6 @@ declare amdgpu_kernel void @kernel_declaration() attributes #0 = { "amdgpu-elide-module-lds" } -; CHECK: attributes #0 = { "amdgpu-elide-module-lds" } + +; CHECK: attributes #0 = { "amdgpu-lds-size"="12" } +; CHECK: attributes #1 = { "amdgpu-elide-module-lds" }