diff --git a/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp b/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp
--- a/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp
@@ -30,6 +30,7 @@
 #include "Utils/AMDGPUBaseInfo.h"
 #include "Utils/AMDGPUMemoryUtils.h"
 #include "llvm/ADT/STLExtras.h"
+#include "llvm/Analysis/CallGraph.h"
 #include "llvm/IR/Constants.h"
 #include "llvm/IR/DerivedTypes.h"
 #include "llvm/IR/IRBuilder.h"
@@ -163,9 +164,10 @@
   }

   bool runOnModule(Module &M) override {
+    CallGraph CG = CallGraph(M);
     UsedList = getUsedList(M);
     bool Changed = superAlignLDSGlobals(M);
-    Changed |= processUsedLDS(M);
+    Changed |= processUsedLDS(CG, M);

     for (Function &F : M.functions()) {
       if (F.isDeclaration())
@@ -174,7 +176,7 @@
       // Only lower compute kernels' LDS.
       if (!AMDGPU::isKernel(F.getCallingConv()))
         continue;
-      Changed |= processUsedLDS(M, &F);
+      Changed |= processUsedLDS(CG, M, &F);
     }

     UsedList.clear();
@@ -226,7 +228,7 @@
     return Changed;
   }

-  bool processUsedLDS(Module &M, Function *F = nullptr) {
+  bool processUsedLDS(CallGraph const &CG, Module &M, Function *F = nullptr) {
     LLVMContext &Ctx = M.getContext();
     const DataLayout &DL = M.getDataLayout();

@@ -374,7 +376,20 @@
     IRBuilder<> Builder(Ctx);
     for (Function &Func : M.functions()) {
       if (!Func.isDeclaration() && AMDGPU::isKernelCC(&Func)) {
-        markUsedByKernel(Builder, &Func, SGV);
+        const CallGraphNode *N = CG[&Func];
+        const bool CalleesRequireModuleLDS = N->size() > 0;
+
+        if (CalleesRequireModuleLDS) {
+          // If a function this kernel might call requires module LDS,
+          // annotate the kernel to let later passes know it will allocate
+          // this structure, even if not apparent from the IR.
+          markUsedByKernel(Builder, &Func, SGV);
+        } else {
+          // However if we are certain this kernel cannot call a function that
+          // requires module LDS, annotate the kernel so the backend can elide
+          // the allocation without repeating callgraph walks.
+          Func.addFnAttr("amdgpu-elide-module-lds", "true");
+        }
       }
     }
   }
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.h b/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.h
--- a/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.h
@@ -56,6 +56,9 @@
   // Kernel may need limited waves per EU for better performance.
   bool WaveLimiter = false;

+  // This kernel calls no functions that require the module LDS struct.
+  bool CanElideModuleLDS = false;
+
 public:
   AMDGPUMachineFunction(const MachineFunction &MF);

@@ -91,6 +94,8 @@
     return WaveLimiter;
   }

+  bool canElideModuleLDS() const { return CanElideModuleLDS; }
+
   unsigned allocateLDSGlobal(const DataLayout &DL, const GlobalVariable &GV);

   void allocateModuleLDSGlobal(const Module *M);
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.cpp b/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.cpp
--- a/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.cpp
@@ -32,6 +32,9 @@
   Attribute WaveLimitAttr = F.getFnAttribute("amdgpu-wave-limiter");
   WaveLimiter = WaveLimitAttr.getValueAsBool();

+  const char *Attr = "amdgpu-elide-module-lds";
+  CanElideModuleLDS = F.getFnAttribute(Attr).getValueAsBool();
+
   CallingConv::ID CC = F.getCallingConv();
   if (CC == CallingConv::AMDGPU_KERNEL || CC == CallingConv::SPIR_KERNEL)
     ExplicitKernArgSize = ST.getExplicitKernArgSize(F, MaxKernArgAlign);
@@ -64,7 +67,7 @@
 void AMDGPUMachineFunction::allocateModuleLDSGlobal(const Module *M) {
   if (isModuleEntryFunction()) {
     const GlobalVariable *GV = M->getNamedGlobal("llvm.amdgcn.module.lds");
-    if (GV) {
+    if (GV && !canElideModuleLDS()) {
       unsigned Offset = allocateLDSGlobal(M->getDataLayout(), *GV);
       (void)Offset;
       assert(Offset == 0 &&
diff --git a/llvm/test/CodeGen/AMDGPU/lower-kernel-and-module-lds.ll b/llvm/test/CodeGen/AMDGPU/lower-kernel-and-module-lds.ll
--- a/llvm/test/CodeGen/AMDGPU/lower-kernel-and-module-lds.ll
+++ b/llvm/test/CodeGen/AMDGPU/lower-kernel-and-module-lds.ll
@@ -21,9 +21,8 @@
 ; CHECK: @llvm.amdgcn.kernel..lds = internal addrspace(3) global %llvm.amdgcn.kernel..lds.t undef, align 2
 ; CHECK: @llvm.amdgcn.kernel..lds.1 = internal addrspace(3) global %llvm.amdgcn.kernel..lds.t.0 undef, align 4
 ;.
-define amdgpu_kernel void @k0() {
+define amdgpu_kernel void @k0() #0 {
 ; CHECK-LABEL: @k0(
-; CHECK-NEXT: call void @llvm.donothing() [ "ExplicitUse"(%llvm.amdgcn.module.lds.t addrspace(3)* @llvm.amdgcn.module.lds) ]
 ; CHECK-NEXT: %lds.size.1.align.1.bc = bitcast [1 x i8] addrspace(3)* getelementptr inbounds (%llvm.amdgcn.module.lds.t, %llvm.amdgcn.module.lds.t addrspace(3)* @llvm.amdgcn.module.lds, i32 0, i32 1) to i8 addrspace(3)*
 ; CHECK-NEXT: store i8 1, i8 addrspace(3)* %lds.size.1.align.1.bc, align 8
 ; CHECK-NEXT: %lds.size.2.align.2.bc = bitcast [2 x i8] addrspace(3)* getelementptr inbounds (%llvm.amdgcn.kernel.k0.lds.t, %llvm.amdgcn.kernel.k0.lds.t addrspace(3)* @llvm.amdgcn.kernel.k0.lds, i32 0, i32 2) to i8 addrspace(3)*
@@ -49,9 +48,8 @@
   ret void
 }

-define amdgpu_kernel void @k1() {
+define amdgpu_kernel void @k1() #0 {
 ; CHECK-LABEL: @k1(
-; CHECK-NEXT: call void @llvm.donothing() [ "ExplicitUse"(%llvm.amdgcn.module.lds.t addrspace(3)* @llvm.amdgcn.module.lds) ]
 ; CHECK-NEXT: %lds.size.2.align.2.bc = bitcast [2 x i8] addrspace(3)* getelementptr inbounds (%llvm.amdgcn.kernel.k1.lds.t, %llvm.amdgcn.kernel.k1.lds.t addrspace(3)* @llvm.amdgcn.kernel.k1.lds, i32 0, i32 2) to i8 addrspace(3)*
 ; CHECK-NEXT: store i8 2, i8 addrspace(3)* %lds.size.2.align.2.bc, align 4
 ; CHECK-NEXT: %lds.size.4.align.4.bc = bitcast [4 x i8] addrspace(3)* getelementptr inbounds (%llvm.amdgcn.kernel.k1.lds.t, %llvm.amdgcn.kernel.k1.lds.t addrspace(3)* @llvm.amdgcn.kernel.k1.lds, i32 0, i32 1) to i8 addrspace(3)*
@@ -72,9 +70,8 @@
   ret void
 }

-define amdgpu_kernel void @0() {
+define amdgpu_kernel void @0() #0 {
 ; CHECK-LABEL: @0(
-; CHECK-NEXT: call void @llvm.donothing() [ "ExplicitUse"(%llvm.amdgcn.module.lds.t addrspace(3)* @llvm.amdgcn.module.lds) ]
 ; CHECK-NEXT: %lds.size.2.align.2.bc = bitcast [2 x i8] addrspace(3)* getelementptr inbounds (%llvm.amdgcn.kernel..lds.t, %llvm.amdgcn.kernel..lds.t addrspace(3)* @llvm.amdgcn.kernel..lds, i32 0, i32 0) to i8 addrspace(3)*
 ; CHECK-NEXT: store i8 2, i8 addrspace(3)* %lds.size.2.align.2.bc, align 2
 ; CHECK-NEXT: ret void
@@ -85,9 +82,8 @@
   ret void
 }

-define amdgpu_kernel void @1() {
+define amdgpu_kernel void @1() #0 {
 ; CHECK-LABEL: @1(
-; CHECK-NEXT: call void @llvm.donothing() [ "ExplicitUse"(%llvm.amdgcn.module.lds.t addrspace(3)* @llvm.amdgcn.module.lds) ]
 ; CHECK-NEXT: %lds.size.4.align.4.bc = bitcast [4 x i8] addrspace(3)* getelementptr inbounds (%llvm.amdgcn.kernel..lds.t.0, %llvm.amdgcn.kernel..lds.t.0 addrspace(3)* @llvm.amdgcn.kernel..lds.1, i32 0, i32 0) to i8 addrspace(3)*
 ; CHECK-NEXT: store i8 4, i8 addrspace(3)* %lds.size.4.align.4.bc, align 4
 ; CHECK-NEXT: ret void
@@ -114,6 +110,6 @@
   ret void
 }

-;.
-; CHECK: attributes #0 = { nofree nosync nounwind readnone willreturn }
-;.
+
+attributes #0 = { "amdgpu-elide-module-lds"="true" }
+; CHECK: attributes #0 = { "amdgpu-elide-module-lds"="true" }
diff --git a/llvm/test/CodeGen/AMDGPU/lower-module-lds-constantexpr.ll b/llvm/test/CodeGen/AMDGPU/lower-module-lds-constantexpr.ll
--- a/llvm/test/CodeGen/AMDGPU/lower-module-lds-constantexpr.ll
+++ b/llvm/test/CodeGen/AMDGPU/lower-module-lds-constantexpr.ll
@@ -35,8 +35,8 @@
   ret void
 }

-; CHECK-LABEL: @timestwo()
-; CHECK: call void @llvm.donothing() [ "ExplicitUse"(%llvm.amdgcn.module.lds.t addrspace(3)* @llvm.amdgcn.module.lds) ]
+; CHECK-LABEL: @timestwo() #0
+; CHECK-NOT: call void @llvm.donothing()
 ; CHECK: %1 = bitcast float addrspace(3)* getelementptr inbounds (%llvm.amdgcn.kernel.timestwo.lds.t, %llvm.amdgcn.kernel.timestwo.lds.t addrspace(3)* @llvm.amdgcn.kernel.timestwo.lds, i32 0, i32 0) to i32 addrspace(3)*
 ; CHECK: %2 = addrspacecast i32 addrspace(3)* %1 to i32*
 ; CHECK: %3 = ptrtoint i32* %2 to i64
@@ -56,3 +56,6 @@
   store i32 %mul, i32* inttoptr (i64 add (i64 ptrtoint (i32* addrspacecast (i32 addrspace(3)* bitcast (float addrspace(3)* @kern to i32 addrspace(3)*) to i32*) to i64), i64 ptrtoint (i32* addrspacecast (i32 addrspace(3)* bitcast (float addrspace(3)* @both to i32 addrspace(3)*) to i32*) to i64)) to i32*), align 4
   ret void
 }
+
+attributes #0 = { "amdgpu-elide-module-lds"="true" }
+; CHECK: attributes #0 = { "amdgpu-elide-module-lds"="true" }
diff --git a/llvm/test/CodeGen/AMDGPU/lower-module-lds-offsets.ll b/llvm/test/CodeGen/AMDGPU/lower-module-lds-offsets.ll
--- a/llvm/test/CodeGen/AMDGPU/lower-module-lds-offsets.ll
+++ b/llvm/test/CodeGen/AMDGPU/lower-module-lds-offsets.ll
@@ -4,7 +4,7 @@
 ; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s

 ; Check that module LDS is allocated at address 0 and kernel starts its
-; allocation past module LDS.
+; allocation past module LDS when a call is present.
 @lds.size.1.align.1 = internal unnamed_addr addrspace(3) global [1 x i8] undef, align 1
 @lds.size.16.align.16 = internal unnamed_addr addrspace(3) global [16 x i8] undef, align 16
@@ -22,12 +22,14 @@
 ; OPT-NEXT: store i8 1, i8 addrspace(3)* [[LDS_SIZE_1_ALIGN_1_BC]], align 1
 ; OPT-NEXT: [[LDS_SIZE_16_ALIGN_16_BC:%.*]] = bitcast [16 x i8] addrspace(3)* getelementptr inbounds ([[LLVM_AMDGCN_KERNEL_K0_LDS_T:%.*]], [[LLVM_AMDGCN_KERNEL_K0_LDS_T]] addrspace(3)* @llvm.amdgcn.kernel.k0.lds, i32 0, i32 0) to i8 addrspace(3)*
 ; OPT-NEXT: store i8 2, i8 addrspace(3)* [[LDS_SIZE_16_ALIGN_16_BC]], align 16
+; OPT-NEXT: call void @f0()
 ; OPT-NEXT: ret void
 ;
   %lds.size.1.align.1.bc = bitcast [1 x i8] addrspace(3)* @lds.size.1.align.1 to i8 addrspace(3)*
   store i8 1, i8 addrspace(3)* %lds.size.1.align.1.bc, align 1
   %lds.size.16.align.16.bc = bitcast [16 x i8] addrspace(3)* @lds.size.16.align.16 to i8 addrspace(3)*
   store i8 2, i8 addrspace(3)* %lds.size.16.align.16.bc, align 16
+  call void @f0()
   ret void
 }

@@ -36,7 +38,7 @@
 ; GCN-DAG: v_mov_b32_e32 [[TREE:v[0-9]+]], 3
 ; GCN: ds_write_b8 [[NULL]], [[TREE]]
 define void @f0() {
-; OPT-LABEL: @f0(
+; OPT-LABEL: @f0() {
 ; OPT-NEXT: [[LDS_SIZE_1_ALIGN_1_BC:%.*]] = bitcast [1 x i8] addrspace(3)* getelementptr inbounds ([[LLVM_AMDGCN_MODULE_LDS_T:%.*]], [[LLVM_AMDGCN_MODULE_LDS_T]] addrspace(3)* @llvm.amdgcn.module.lds, i32 0, i32 0) to i8 addrspace(3)*
 ; OPT-NEXT: store i8 3, i8 addrspace(3)* [[LDS_SIZE_1_ALIGN_1_BC]], align 1
 ; OPT-NEXT: ret void
@@ -45,3 +47,35 @@
   store i8 3, i8 addrspace(3)* %lds.size.1.align.1.bc, align 1
   ret void
 }
+
+; Without the function call, no module.lds.t allocation at address zero is needed,
+; so the two variables are allocated in an unspecified order. This is a weakness
+; in the current codegen: the variable was moved into module.lds.t because another
+; function required it there, and module.lds.t is then allocated like any other
+; variable. Coincidentally, codegen currently puts the higher-alignment variable
+; at zero. A later patch will avoid moving variables into module.lds.t for kernels
+; that do not require it, leaving the variables to be optimally allocated in the kernel-specific struct.
+; GCN-LABEL: {{^}}k1:
+; GCN-DAG: v_mov_b32_e32 [[NULL:v[0-9]+]], 0
+; GCN-DAG: v_mov_b32_e32 [[ONE:v[0-9]+]], 1
+; GCN: ds_write_b8 [[NULL]], [[ONE]] offset:16
+; GCN: v_mov_b32_e32 [[TWO:v[0-9]+]], 2
+; GCN: ds_write_b8 [[NULL]], [[TWO]]
+define amdgpu_kernel void @k1() #0 {
+; OPT-LABEL: @k1(
+; OPT-NOT: call void @llvm.donothing()
+; OPT-NEXT: [[LDS_SIZE_1_ALIGN_1_BC:%.*]] = bitcast [1 x i8] addrspace(3)* getelementptr inbounds (%llvm.amdgcn.module.lds.t, %llvm.amdgcn.module.lds.t addrspace(3)* @llvm.amdgcn.module.lds, i32 0, i32 0) to i8 addrspace(3)*
+; OPT-NEXT: store i8 1, i8 addrspace(3)* [[LDS_SIZE_1_ALIGN_1_BC]], align 1
+; OPT-NEXT: [[LDS_SIZE_16_ALIGN_16_BC:%.*]] = bitcast [16 x i8] addrspace(3)* getelementptr inbounds ([[LLVM_AMDGCN_KERNEL_K1_LDS_T:%.*]], [[LLVM_AMDGCN_KERNEL_K1_LDS_T]] addrspace(3)* @llvm.amdgcn.kernel.k1.lds, i32 0, i32 0) to i8 addrspace(3)*
+; OPT-NEXT: store i8 2, i8 addrspace(3)* [[LDS_SIZE_16_ALIGN_16_BC]], align 16
+; OPT-NEXT: ret void
+;
+  %lds.size.1.align.1.bc = bitcast [1 x i8] addrspace(3)* @lds.size.1.align.1 to i8 addrspace(3)*
+  store i8 1, i8 addrspace(3)* %lds.size.1.align.1.bc, align 1
+  %lds.size.16.align.16.bc = bitcast [16 x i8] addrspace(3)* @lds.size.16.align.16 to i8 addrspace(3)*
+  store i8 2, i8 addrspace(3)* %lds.size.16.align.16.bc, align 16
+  ret void
+}
+
+attributes #0 = { "amdgpu-elide-module-lds"="true" }
+; CHECK: attributes #0 = { "amdgpu-elide-module-lds"="true" }
diff --git a/llvm/test/CodeGen/AMDGPU/lower-module-lds.ll b/llvm/test/CodeGen/AMDGPU/lower-module-lds.ll
--- a/llvm/test/CodeGen/AMDGPU/lower-module-lds.ll
+++ b/llvm/test/CodeGen/AMDGPU/lower-module-lds.ll
@@ -48,13 +48,16 @@
   ret void
 }

-; This kernel does not need to alloc the LDS block as it makes no calls
+; This kernel does not alloc the LDS block as it makes no calls
 ; CHECK-LABEL: @kern_empty()
-; CHECK: call void @llvm.donothing() [ "ExplicitUse"(%llvm.amdgcn.module.lds.t addrspace(3)* @llvm.amdgcn.module.lds) ]
-define spir_kernel void @kern_empty() {
+; CHECK-NOT: call void @llvm.donothing()
+define spir_kernel void @kern_empty() #0 {
   ret void
 }

 ; Make sure we don't crash trying to insert code into a kernel
 ; declaration.
 declare amdgpu_kernel void @kernel_declaration()
+
+attributes #0 = { "amdgpu-elide-module-lds"="true" }
+; CHECK: attributes #0 = { "amdgpu-elide-module-lds"="true" }
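
For illustration, here is a minimal sketch of the new behaviour on a hypothetical module (written for this note, not taken from the tests above; the names @var, @helper and @kern are invented). A variable used by a non-kernel function is moved into @llvm.amdgcn.module.lds by the amdgpu-lower-module-lds pass, and a kernel whose call graph node is empty is annotated for elision instead of receiving the llvm.donothing marker:

; Input to the amdgpu-lower-module-lds pass.
@var = internal addrspace(3) global i8 undef, align 1

; A non-kernel function using LDS forces @var into the module LDS struct.
define void @helper() {
  store i8 1, i8 addrspace(3)* @var, align 1
  ret void
}

; This kernel has no callees, so CalleesRequireModuleLDS is false for it.
define amdgpu_kernel void @kern() {
  ret void
}

; Expected output shape (illustrative, not autogenerated checks): @kern gains
; the elide attribute rather than the
; llvm.donothing() [ "ExplicitUse"(...) ] marker, and allocateModuleLDSGlobal
; in the backend then skips the zero-offset allocation for it:
;
;   define amdgpu_kernel void @kern() #0 {
;     ret void
;   }
;   attributes #0 = { "amdgpu-elide-module-lds"="true" }

A kernel that did call @helper would instead keep the llvm.donothing marker, so later passes still see that the module LDS struct is allocated at address zero for that kernel.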