diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp --- a/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp @@ -498,7 +498,7 @@ const SITargetLowering &TLI = *getTLI(); const DataLayout &DL = F.getParent()->getDataLayout(); - Info->allocateModuleLDSGlobal(F.getParent()); + Info->allocateModuleLDSGlobal(F); SmallVector<CCValAssign, 16> ArgLocs; CCState CCInfo(F.getCallingConv(), F.isVarArg(), MF, ArgLocs, F.getContext()); @@ -583,7 +583,7 @@ const SIRegisterInfo *TRI = Subtarget.getRegisterInfo(); const DataLayout &DL = F.getParent()->getDataLayout(); - Info->allocateModuleLDSGlobal(F.getParent()); + Info->allocateModuleLDSGlobal(F); SmallVector<CCValAssign, 16> ArgLocs; CCState CCInfo(CC, F.isVarArg(), MF, ArgLocs, F.getContext()); diff --git a/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp b/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp --- a/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp @@ -30,6 +30,7 @@ #include "Utils/AMDGPUBaseInfo.h" #include "Utils/AMDGPUMemoryUtils.h" #include "llvm/ADT/STLExtras.h" +#include "llvm/Analysis/CallGraph.h" #include "llvm/IR/Constants.h" #include "llvm/IR/DerivedTypes.h" #include "llvm/IR/IRBuilder.h" @@ -163,9 +164,10 @@ } bool runOnModule(Module &M) override { + CallGraph CG = CallGraph(M); UsedList = getUsedList(M); bool Changed = superAlignLDSGlobals(M); - Changed |= processUsedLDS(M); + Changed |= processUsedLDS(CG, M); for (Function &F : M.functions()) { if (F.isDeclaration()) @@ -174,7 +176,7 @@ // Only lower compute kernels' LDS. 
if (!AMDGPU::isKernel(F.getCallingConv())) continue; - Changed |= processUsedLDS(M, &F); + Changed |= processUsedLDS(CG, M, &F); } UsedList.clear(); @@ -226,7 +228,7 @@ return Changed; } - bool processUsedLDS(Module &M, Function *F = nullptr) { + bool processUsedLDS(CallGraph const &CG, Module &M, Function *F = nullptr) { LLVMContext &Ctx = M.getContext(); const DataLayout &DL = M.getDataLayout(); @@ -374,7 +376,20 @@ IRBuilder<> Builder(Ctx); for (Function &Func : M.functions()) { if (!Func.isDeclaration() && AMDGPU::isKernelCC(&Func)) { - markUsedByKernel(Builder, &Func, SGV); + const CallGraphNode *N = CG[&Func]; + const bool CalleesRequireModuleLDS = N->size() > 0; + + if (CalleesRequireModuleLDS) { + // If a function this kernel might call requires module LDS, + // annotate the kernel to let later passes know it will allocate + // this structure, even if not apparent from the IR. + markUsedByKernel(Builder, &Func, SGV); + } else { + // However if we are certain this kernel cannot call a function that + // requires module LDS, annotate the kernel so the backend can elide + // the allocation without repeating callgraph walks. 
+ Func.addFnAttr("amdgpu-elide-module-lds"); + } } } } diff --git a/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.h b/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.h --- a/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.h @@ -99,7 +99,7 @@ } unsigned allocateLDSGlobal(const DataLayout &DL, const GlobalVariable &GV); - void allocateModuleLDSGlobal(const Module *M); + void allocateModuleLDSGlobal(const Function &F); Align getDynLDSAlign() const { return DynLDSAlign; } diff --git a/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.cpp b/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.cpp --- a/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.cpp @@ -82,10 +82,16 @@ return Offset; } -void AMDGPUMachineFunction::allocateModuleLDSGlobal(const Module *M) { +// This kernel calls no functions that require the module lds struct +static bool canElideModuleLDS(const Function &F) { + return F.hasFnAttribute("amdgpu-elide-module-lds"); +} + +void AMDGPUMachineFunction::allocateModuleLDSGlobal(const Function &F) { + const Module *M = F.getParent(); if (isModuleEntryFunction()) { const GlobalVariable *GV = M->getNamedGlobal("llvm.amdgcn.module.lds"); - if (GV) { + if (GV && !canElideModuleLDS(F)) { unsigned Offset = allocateLDSGlobal(M->getDataLayout(), *GV); (void)Offset; assert(Offset == 0 && diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp --- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -2447,7 +2447,7 @@ return DAG.getEntryNode(); } - Info->allocateModuleLDSGlobal(Fn.getParent()); + Info->allocateModuleLDSGlobal(Fn); SmallVector<ISD::InputArg, 16> Splits; SmallVector<CCValAssign, 16> ArgLocs; diff --git a/llvm/test/CodeGen/AMDGPU/lower-kernel-and-module-lds.ll b/llvm/test/CodeGen/AMDGPU/lower-kernel-and-module-lds.ll --- a/llvm/test/CodeGen/AMDGPU/lower-kernel-and-module-lds.ll +++ 
b/llvm/test/CodeGen/AMDGPU/lower-kernel-and-module-lds.ll @@ -21,9 +21,8 @@ ; CHECK: @llvm.amdgcn.kernel..lds = internal addrspace(3) global %llvm.amdgcn.kernel..lds.t undef, align 2 ; CHECK: @llvm.amdgcn.kernel..lds.1 = internal addrspace(3) global %llvm.amdgcn.kernel..lds.t.0 undef, align 4 ;. -define amdgpu_kernel void @k0() { +define amdgpu_kernel void @k0() #0 { ; CHECK-LABEL: @k0( -; CHECK-NEXT: call void @llvm.donothing() [ "ExplicitUse"(%llvm.amdgcn.module.lds.t addrspace(3)* @llvm.amdgcn.module.lds) ] ; CHECK-NEXT: %lds.size.1.align.1.bc = bitcast [1 x i8] addrspace(3)* getelementptr inbounds (%llvm.amdgcn.module.lds.t, %llvm.amdgcn.module.lds.t addrspace(3)* @llvm.amdgcn.module.lds, i32 0, i32 1) to i8 addrspace(3)* ; CHECK-NEXT: store i8 1, i8 addrspace(3)* %lds.size.1.align.1.bc, align 8 ; CHECK-NEXT: %lds.size.2.align.2.bc = bitcast [2 x i8] addrspace(3)* getelementptr inbounds (%llvm.amdgcn.kernel.k0.lds.t, %llvm.amdgcn.kernel.k0.lds.t addrspace(3)* @llvm.amdgcn.kernel.k0.lds, i32 0, i32 2) to i8 addrspace(3)* @@ -49,9 +48,8 @@ ret void } -define amdgpu_kernel void @k1() { +define amdgpu_kernel void @k1() #0 { ; CHECK-LABEL: @k1( -; CHECK-NEXT: call void @llvm.donothing() [ "ExplicitUse"(%llvm.amdgcn.module.lds.t addrspace(3)* @llvm.amdgcn.module.lds) ] ; CHECK-NEXT: %lds.size.2.align.2.bc = bitcast [2 x i8] addrspace(3)* getelementptr inbounds (%llvm.amdgcn.kernel.k1.lds.t, %llvm.amdgcn.kernel.k1.lds.t addrspace(3)* @llvm.amdgcn.kernel.k1.lds, i32 0, i32 2) to i8 addrspace(3)* ; CHECK-NEXT: store i8 2, i8 addrspace(3)* %lds.size.2.align.2.bc, align 4 ; CHECK-NEXT: %lds.size.4.align.4.bc = bitcast [4 x i8] addrspace(3)* getelementptr inbounds (%llvm.amdgcn.kernel.k1.lds.t, %llvm.amdgcn.kernel.k1.lds.t addrspace(3)* @llvm.amdgcn.kernel.k1.lds, i32 0, i32 1) to i8 addrspace(3)* @@ -72,9 +70,8 @@ ret void } -define amdgpu_kernel void @0() { +define amdgpu_kernel void @0() #0 { ; CHECK-LABEL: @0( -; CHECK-NEXT: call void @llvm.donothing() [ 
"ExplicitUse"(%llvm.amdgcn.module.lds.t addrspace(3)* @llvm.amdgcn.module.lds) ] ; CHECK-NEXT: %lds.size.2.align.2.bc = bitcast [2 x i8] addrspace(3)* getelementptr inbounds (%llvm.amdgcn.kernel..lds.t, %llvm.amdgcn.kernel..lds.t addrspace(3)* @llvm.amdgcn.kernel..lds, i32 0, i32 0) to i8 addrspace(3)* ; CHECK-NEXT: store i8 2, i8 addrspace(3)* %lds.size.2.align.2.bc, align 2 ; CHECK-NEXT: ret void @@ -85,9 +82,8 @@ ret void } -define amdgpu_kernel void @1() { +define amdgpu_kernel void @1() #0 { ; CHECK-LABEL: @1( -; CHECK-NEXT: call void @llvm.donothing() [ "ExplicitUse"(%llvm.amdgcn.module.lds.t addrspace(3)* @llvm.amdgcn.module.lds) ] ; CHECK-NEXT: %lds.size.4.align.4.bc = bitcast [4 x i8] addrspace(3)* getelementptr inbounds (%llvm.amdgcn.kernel..lds.t.0, %llvm.amdgcn.kernel..lds.t.0 addrspace(3)* @llvm.amdgcn.kernel..lds.1, i32 0, i32 0) to i8 addrspace(3)* ; CHECK-NEXT: store i8 4, i8 addrspace(3)* %lds.size.4.align.4.bc, align 4 ; CHECK-NEXT: ret void @@ -114,6 +110,6 @@ ret void } -;. -; CHECK: attributes #0 = { nocallback nofree nosync nounwind readnone willreturn } -;. 
+ +attributes #0 = { "amdgpu-elide-module-lds" } +; CHECK: attributes #0 = { "amdgpu-elide-module-lds" } diff --git a/llvm/test/CodeGen/AMDGPU/lower-module-lds-constantexpr.ll b/llvm/test/CodeGen/AMDGPU/lower-module-lds-constantexpr.ll --- a/llvm/test/CodeGen/AMDGPU/lower-module-lds-constantexpr.ll +++ b/llvm/test/CodeGen/AMDGPU/lower-module-lds-constantexpr.ll @@ -35,8 +35,8 @@ ret void } -; CHECK-LABEL: @timestwo() -; CHECK: call void @llvm.donothing() [ "ExplicitUse"(%llvm.amdgcn.module.lds.t addrspace(3)* @llvm.amdgcn.module.lds) ] +; CHECK-LABEL: @timestwo() #0 +; CHECK-NOT: call void @llvm.donothing() ; CHECK: %1 = bitcast float addrspace(3)* getelementptr inbounds (%llvm.amdgcn.kernel.timestwo.lds.t, %llvm.amdgcn.kernel.timestwo.lds.t addrspace(3)* @llvm.amdgcn.kernel.timestwo.lds, i32 0, i32 0) to i32 addrspace(3)* ; CHECK: %2 = addrspacecast i32 addrspace(3)* %1 to i32* ; CHECK: %3 = ptrtoint i32* %2 to i64 @@ -56,3 +56,6 @@ store i32 %mul, i32* inttoptr (i64 add (i64 ptrtoint (i32* addrspacecast (i32 addrspace(3)* bitcast (float addrspace(3)* @kern to i32 addrspace(3)*) to i32*) to i64), i64 ptrtoint (i32* addrspacecast (i32 addrspace(3)* bitcast (float addrspace(3)* @both to i32 addrspace(3)*) to i32*) to i64)) to i32*), align 4 ret void } + +attributes #0 = { "amdgpu-elide-module-lds" } +; CHECK: attributes #0 = { "amdgpu-elide-module-lds" } diff --git a/llvm/test/CodeGen/AMDGPU/lower-module-lds-offsets.ll b/llvm/test/CodeGen/AMDGPU/lower-module-lds-offsets.ll --- a/llvm/test/CodeGen/AMDGPU/lower-module-lds-offsets.ll +++ b/llvm/test/CodeGen/AMDGPU/lower-module-lds-offsets.ll @@ -4,7 +4,7 @@ ; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s ; Check that module LDS is allocated at address 0 and kernel starts its -; allocation past module LDS. +; allocation past module LDS when a call is present. 
@lds.size.1.align.1 = internal unnamed_addr addrspace(3) global [1 x i8] undef, align 1 @lds.size.16.align.16 = internal unnamed_addr addrspace(3) global [16 x i8] undef, align 16 @@ -22,12 +22,14 @@ ; OPT-NEXT: store i8 1, i8 addrspace(3)* [[LDS_SIZE_1_ALIGN_1_BC]], align 1 ; OPT-NEXT: [[LDS_SIZE_16_ALIGN_16_BC:%.*]] = bitcast [16 x i8] addrspace(3)* getelementptr inbounds ([[LLVM_AMDGCN_KERNEL_K0_LDS_T:%.*]], [[LLVM_AMDGCN_KERNEL_K0_LDS_T]] addrspace(3)* @llvm.amdgcn.kernel.k0.lds, i32 0, i32 0) to i8 addrspace(3)* ; OPT-NEXT: store i8 2, i8 addrspace(3)* [[LDS_SIZE_16_ALIGN_16_BC]], align 16 +; OPT-NEXT: call void @f0() ; OPT-NEXT: ret void ; %lds.size.1.align.1.bc = bitcast [1 x i8] addrspace(3)* @lds.size.1.align.1 to i8 addrspace(3)* store i8 1, i8 addrspace(3)* %lds.size.1.align.1.bc, align 1 %lds.size.16.align.16.bc = bitcast [16 x i8] addrspace(3)* @lds.size.16.align.16 to i8 addrspace(3)* store i8 2, i8 addrspace(3)* %lds.size.16.align.16.bc, align 16 + call void @f0() ret void } @@ -36,7 +38,7 @@ ; GCN-DAG: v_mov_b32_e32 [[TREE:v[0-9]+]], 3 ; GCN: ds_write_b8 [[NULL]], [[TREE]] define void @f0() { -; OPT-LABEL: @f0( +; OPT-LABEL: @f0() { ; OPT-NEXT: [[LDS_SIZE_1_ALIGN_1_BC:%.*]] = bitcast [1 x i8] addrspace(3)* getelementptr inbounds ([[LLVM_AMDGCN_MODULE_LDS_T:%.*]], [[LLVM_AMDGCN_MODULE_LDS_T]] addrspace(3)* @llvm.amdgcn.module.lds, i32 0, i32 0) to i8 addrspace(3)* ; OPT-NEXT: store i8 3, i8 addrspace(3)* [[LDS_SIZE_1_ALIGN_1_BC]], align 1 ; OPT-NEXT: ret void @@ -45,3 +47,6 @@ store i8 3, i8 addrspace(3)* %lds.size.1.align.1.bc, align 1 ret void } + +attributes #0 = { "amdgpu-elide-module-lds" } +; CHECK: attributes #0 = { "amdgpu-elide-module-lds" } diff --git a/llvm/test/CodeGen/AMDGPU/lower-module-lds.ll b/llvm/test/CodeGen/AMDGPU/lower-module-lds.ll --- a/llvm/test/CodeGen/AMDGPU/lower-module-lds.ll +++ b/llvm/test/CodeGen/AMDGPU/lower-module-lds.ll @@ -48,13 +48,16 @@ ret void } -; This kernel does not need to alloc the LDS block as it makes no 
 calls +; This kernel does not alloc the LDS block as it makes no calls ; CHECK-LABEL: @kern_empty() -; CHECK: call void @llvm.donothing() [ "ExplicitUse"(%llvm.amdgcn.module.lds.t addrspace(3)* @llvm.amdgcn.module.lds) ] -define spir_kernel void @kern_empty() { +; CHECK-NOT: call void @llvm.donothing() +define spir_kernel void @kern_empty() #0 { ret void } ; Make sure we don't crash trying to insert code into a kernel ; declaration. declare amdgpu_kernel void @kernel_declaration() + +attributes #0 = { "amdgpu-elide-module-lds" } +; CHECK: attributes #0 = { "amdgpu-elide-module-lds" }