diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp
--- a/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp
@@ -498,7 +498,7 @@
   const SITargetLowering &TLI = *getTLI<SITargetLowering>();
   const DataLayout &DL = F.getParent()->getDataLayout();
 
-  Info->allocateModuleLDSGlobal(F);
+  Info->allocateKnownAddressLDSGlobal(F);
 
   SmallVector<CCValAssign, 16> ArgLocs;
   CCState CCInfo(F.getCallingConv(), F.isVarArg(), MF, ArgLocs, F.getContext());
@@ -582,7 +582,7 @@
   const SIRegisterInfo *TRI = Subtarget.getRegisterInfo();
   const DataLayout &DL = F.getParent()->getDataLayout();
 
-  Info->allocateModuleLDSGlobal(F);
+  Info->allocateKnownAddressLDSGlobal(F);
 
   SmallVector<CCValAssign, 16> ArgLocs;
   CCState CCInfo(CC, F.isVarArg(), MF, ArgLocs, F.getContext());
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.h b/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.h
--- a/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.h
@@ -102,8 +102,18 @@
     return WaveLimiter;
   }
 
-  unsigned allocateLDSGlobal(const DataLayout &DL, const GlobalVariable &GV);
-  void allocateModuleLDSGlobal(const Function &F);
+  unsigned allocateLDSGlobal(const DataLayout &DL, const GlobalVariable &GV) {
+    return allocateLDSGlobal(DL, GV, DynLDSAlign);
+  }
+  unsigned allocateLDSGlobal(const DataLayout &DL, const GlobalVariable &GV,
+                             Align Trailing);
+
+  void allocateKnownAddressLDSGlobal(const Function &F);
+
+  // A kernel function may have an associated LDS allocation, and a kernel-scope
+  // LDS allocation must have an associated kernel function.
+  static const GlobalVariable *
+  getKernelLDSGlobalFromFunction(const Function &F);
 
   static Optional<uint32_t> getLDSKernelIdMetadata(const Function &F);
 
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.cpp b/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.cpp
--- a/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.cpp
@@ -49,7 +49,8 @@
 }
 
 unsigned AMDGPUMachineFunction::allocateLDSGlobal(const DataLayout &DL,
-                                                  const GlobalVariable &GV) {
+                                                  const GlobalVariable &GV,
+                                                  Align Trailing) {
   auto Entry = LocalMemoryObjects.insert(std::make_pair(&GV, 0));
   if (!Entry.second)
     return Entry.first->second;
@@ -66,9 +67,8 @@
 
     StaticLDSSize += DL.getTypeAllocSize(GV.getValueType());
 
-    // Update the LDS size considering the padding to align the dynamic shared
-    // memory.
-    LDSSize = alignTo(StaticLDSSize, DynLDSAlign);
+    // Align LDS size to Trailing, e.g. for aligning dynamic shared memory.
+    LDSSize = alignTo(StaticLDSSize, Trailing);
   } else {
     assert(GV.getAddressSpace() == AMDGPUAS::REGION_ADDRESS &&
            "expected region address space");
@@ -84,21 +84,62 @@
   return Offset;
 }
 
+const GlobalVariable *
+AMDGPUMachineFunction::getKernelLDSGlobalFromFunction(const Function &F) {
+  const Module *M = F.getParent();
+  std::string KernelLDSName = "llvm.amdgcn.kernel.";
+  KernelLDSName += F.getName();
+  KernelLDSName += ".lds";
+  return M->getNamedGlobal(KernelLDSName);
+}
+
 // This kernel calls no functions that require the module lds struct
 static bool canElideModuleLDS(const Function &F) {
   return F.hasFnAttribute("amdgpu-elide-module-lds");
}
 
-void AMDGPUMachineFunction::allocateModuleLDSGlobal(const Function &F) {
+void AMDGPUMachineFunction::allocateKnownAddressLDSGlobal(const Function &F) {
   const Module *M = F.getParent();
+
+  // This function is called before allocating any other LDS so that it can
+  // reliably put values at known addresses. Consequently, dynamic LDS, if
+  // present, will not yet have been allocated.
+
+  assert(getDynLDSAlign() == Align() && "dynamic LDS not yet allocated");
+
   if (isModuleEntryFunction()) {
+
+    // Pointer values start from zero, memory allocated per-kernel-launch.
+    // Variables can be grouped into a module level struct and a struct per
+    // kernel function by AMDGPULowerModuleLDSPass. If that is done, they
+    // are allocated at statically computable addresses here.
+    //
+    // Address 0
+    // {
+    //   llvm.amdgcn.module.lds
+    // }
+    // alignment padding
+    // {
+    //   llvm.amdgcn.kernel.some-name.lds
+    // }
+    // other variables, e.g. dynamic lds, allocated after this call
+
     const GlobalVariable *GV = M->getNamedGlobal("llvm.amdgcn.module.lds");
+    const GlobalVariable *KV = getKernelLDSGlobalFromFunction(F);
+
     if (GV && !canElideModuleLDS(F)) {
-      unsigned Offset = allocateLDSGlobal(M->getDataLayout(), *GV);
+      unsigned Offset = allocateLDSGlobal(M->getDataLayout(), *GV, Align());
       (void)Offset;
       assert(Offset == 0 &&
              "Module LDS expected to be allocated before other LDS");
     }
+
+    if (KV) {
+      // The per-kernel offset is deterministic because it is allocated
+      // before any other non-module LDS variables.
+      unsigned Offset = allocateLDSGlobal(M->getDataLayout(), *KV, Align());
+      (void)Offset;
+    }
   }
 }
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -2352,7 +2352,7 @@
     return DAG.getEntryNode();
   }
 
-  Info->allocateModuleLDSGlobal(Fn);
+  Info->allocateKnownAddressLDSGlobal(Fn);
 
   SmallVector<ISD::InputArg, 16> Splits;
   SmallVector<CCValAssign, 16> ArgLocs;
diff --git a/llvm/test/CodeGen/AMDGPU/lds-frame-extern.ll b/llvm/test/CodeGen/AMDGPU/lds-frame-extern.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/lds-frame-extern.ll
@@ -0,0 +1,272 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck %s
+
+; LDS is allocated per-kernel. Module scope variables are gathered into a struct which is
+; allocated at address zero, if used by the kernel. Kernel scope variables are gathered into
+; a per-kernel struct and allocated immediately after the module scope.
+; This test checks that the module and kernel scope variables are allocated in deterministic +; order without spurious alignment padding between the two + +; External LDS is checked because it influences LDS padding in general and because it will +; not be moved into either module or kernel struct + +@module_variable = addrspace(3) global i16 undef + +; Variables are allocated into module scope block when used by a non-kernel function +define void @use_module() #0 { +; CHECK-LABEL: use_module: +; CHECK: ; %bb.0: +; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CHECK-NEXT: s_waitcnt_vscnt null, 0x0 +; CHECK-NEXT: v_mov_b32_e32 v0, 0 +; CHECK-NEXT: ds_write_b16 v0, v0 +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: s_setpc_b64 s[30:31] + store i16 0, i16 addrspace(3)* @module_variable + ret void +} + +; Variables only used by kernels are specialised and allocated per-kernel +@kernel_normal = addrspace(3) global i16 undef +@kernel_overalign = addrspace(3) global i16 undef, align 4 + +; External LDS shall not introduce padding between module and kernel scope variables +@extern_normal = external addrspace(3) global [0 x float] +@extern_overalign = external addrspace(3) global [0 x float], align 8 + +; 2^3 cases encoded into function names + +define amdgpu_kernel void @module_0_kernel_normal_extern_normal(i32 %idx) #1 { +; CHECK-LABEL: module_0_kernel_normal_extern_normal: +; CHECK: ; %bb.0: +; CHECK-NEXT: s_load_dword s0, s[4:5], 0x0 +; CHECK-NEXT: v_mov_b32_e32 v0, 0 +; CHECK-NEXT: v_mov_b32_e32 v1, 2 +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: s_lshl_b32 s0, s0, 2 +; CHECK-NEXT: s_add_i32 s0, s0, 4 +; CHECK-NEXT: v_mov_b32_e32 v2, s0 +; CHECK-NEXT: ds_write_b16 v0, v1 +; CHECK-NEXT: ds_write_b32 v2, v0 +; CHECK-NEXT: s_endpgm + store i16 2, i16 addrspace(3)* @kernel_normal + + %arrayidx1 = getelementptr inbounds [0 x float], [0 x float] addrspace(3)* @extern_normal, i32 0, i32 %idx + store float 0.0, float addrspace(3)* %arrayidx1 + ret void +} + +define amdgpu_kernel void @module_1_kernel_normal_extern_normal(i32 %idx) { +; CHECK-LABEL: module_1_kernel_normal_extern_normal: +; CHECK: ; %bb.0: +; CHECK-NEXT: s_add_u32 s8, s8, s11 +; CHECK-NEXT: s_mov_b32 s32, 0 +; CHECK-NEXT: s_addc_u32 s9, s9, 0 +; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s8 +; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s9 +; CHECK-NEXT: s_add_u32 s0, s0, s11 +; CHECK-NEXT: s_addc_u32 s1, s1, 0 +; CHECK-NEXT: s_getpc_b64 s[8:9] +; CHECK-NEXT: s_add_u32 s8, s8, use_module@gotpcrel32@lo+4 +; CHECK-NEXT: s_addc_u32 s9, s9, use_module@gotpcrel32@hi+12 +; CHECK-NEXT: s_load_dwordx2 s[10:11], s[8:9], 0x0 +; CHECK-NEXT: s_load_dword s12, s[6:7], 0x0 +; CHECK-NEXT: s_mov_b64 s[6:7], s[4:5] +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: s_swappc_b64 s[30:31], s[10:11] +; CHECK-NEXT: s_lshl_b32 s4, s12, 2 +; CHECK-NEXT: v_mov_b32_e32 v0, 0 +; CHECK-NEXT: v_mov_b32_e32 v1, 1 +; CHECK-NEXT: s_add_i32 s4, s4, 4 +; CHECK-NEXT: v_mov_b32_e32 v2, 2 +; CHECK-NEXT: v_mov_b32_e32 v3, s4 +; CHECK-NEXT: ds_write_b16 v0, v1 +; CHECK-NEXT: ds_write_b16 v0, v2 offset:2 +; CHECK-NEXT: ds_write_b32 v3, v0 +; CHECK-NEXT: s_endpgm + call void @use_module() + store i16 1, i16 addrspace(3)* @module_variable + + store i16 2, i16 addrspace(3)* @kernel_normal + + %arrayidx1 = getelementptr inbounds [0 x float], [0 x float] addrspace(3)* @extern_normal, i32 0, i32 %idx + store float 0.0, float addrspace(3)* %arrayidx1 + ret void +} + +define amdgpu_kernel void @module_0_kernel_overalign_extern_normal(i32 %idx) #1 { 
+; CHECK-LABEL: module_0_kernel_overalign_extern_normal: +; CHECK: ; %bb.0: +; CHECK-NEXT: s_load_dword s0, s[4:5], 0x0 +; CHECK-NEXT: v_mov_b32_e32 v0, 0 +; CHECK-NEXT: v_mov_b32_e32 v1, 2 +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: s_lshl_b32 s0, s0, 2 +; CHECK-NEXT: s_add_i32 s0, s0, 4 +; CHECK-NEXT: v_mov_b32_e32 v2, s0 +; CHECK-NEXT: ds_write_b16 v0, v1 +; CHECK-NEXT: ds_write_b32 v2, v0 +; CHECK-NEXT: s_endpgm + store i16 2, i16 addrspace(3)* @kernel_overalign + + %arrayidx1 = getelementptr inbounds [0 x float], [0 x float] addrspace(3)* @extern_normal, i32 0, i32 %idx + store float 0.0, float addrspace(3)* %arrayidx1 + ret void +} + +define amdgpu_kernel void @module_1_kernel_overalign_extern_normal(i32 %idx) { +; CHECK-LABEL: module_1_kernel_overalign_extern_normal: +; CHECK: ; %bb.0: +; CHECK-NEXT: s_add_u32 s8, s8, s11 +; CHECK-NEXT: s_mov_b32 s32, 0 +; CHECK-NEXT: s_addc_u32 s9, s9, 0 +; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s8 +; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s9 +; CHECK-NEXT: s_add_u32 s0, s0, s11 +; CHECK-NEXT: s_addc_u32 s1, s1, 0 +; CHECK-NEXT: s_getpc_b64 s[8:9] +; CHECK-NEXT: s_add_u32 s8, s8, use_module@gotpcrel32@lo+4 +; CHECK-NEXT: s_addc_u32 s9, s9, use_module@gotpcrel32@hi+12 +; CHECK-NEXT: s_load_dwordx2 s[10:11], s[8:9], 0x0 +; CHECK-NEXT: s_load_dword s12, s[6:7], 0x0 +; CHECK-NEXT: s_mov_b64 s[6:7], s[4:5] +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: s_swappc_b64 s[30:31], s[10:11] +; CHECK-NEXT: s_lshl_b32 s4, s12, 2 +; CHECK-NEXT: v_mov_b32_e32 v0, 0 +; CHECK-NEXT: v_mov_b32_e32 v1, 1 +; CHECK-NEXT: s_add_i32 s4, s4, 8 +; CHECK-NEXT: v_mov_b32_e32 v2, 2 +; CHECK-NEXT: v_mov_b32_e32 v3, s4 +; CHECK-NEXT: ds_write_b16 v0, v1 +; CHECK-NEXT: ds_write_b16 v0, v2 offset:4 +; CHECK-NEXT: ds_write_b32 v3, v0 +; CHECK-NEXT: s_endpgm + call void @use_module() + store i16 1, i16 addrspace(3)* @module_variable + + store i16 2, i16 addrspace(3)* @kernel_overalign + + %arrayidx1 = getelementptr inbounds [0 x float], [0 x float] addrspace(3)* @extern_normal, i32 0, i32 %idx + store float 0.0, float addrspace(3)* %arrayidx1 + ret void +} + +define amdgpu_kernel void @module_0_kernel_normal_extern_overalign(i32 %idx) #1 { +; CHECK-LABEL: module_0_kernel_normal_extern_overalign: +; CHECK: ; %bb.0: +; CHECK-NEXT: s_load_dword s0, s[4:5], 0x0 +; CHECK-NEXT: v_mov_b32_e32 v0, 0 +; CHECK-NEXT: v_mov_b32_e32 v1, 2 +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: s_lshl_b32 s0, s0, 2 +; CHECK-NEXT: s_add_i32 s0, s0, 8 +; CHECK-NEXT: v_mov_b32_e32 v2, s0 +; CHECK-NEXT: ds_write_b16 v0, v1 +; CHECK-NEXT: ds_write_b32 v2, v0 +; CHECK-NEXT: s_endpgm + store i16 2, i16 addrspace(3)* @kernel_normal + + %arrayidx1 = getelementptr inbounds [0 x float], [0 x float] addrspace(3)* @extern_overalign, i32 0, i32 %idx + store float 0.0, float addrspace(3)* %arrayidx1 + ret void +} + +define amdgpu_kernel void @module_1_kernel_normal_extern_overalign(i32 %idx) { +; CHECK-LABEL: module_1_kernel_normal_extern_overalign: +; CHECK: ; %bb.0: +; CHECK-NEXT: s_add_u32 s8, s8, s11 +; CHECK-NEXT: s_mov_b32 s32, 0 +; CHECK-NEXT: s_addc_u32 s9, s9, 0 +; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s8 +; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s9 +; CHECK-NEXT: s_add_u32 s0, s0, s11 +; CHECK-NEXT: s_addc_u32 s1, s1, 0 +; CHECK-NEXT: s_getpc_b64 s[8:9] +; CHECK-NEXT: s_add_u32 s8, s8, use_module@gotpcrel32@lo+4 +; CHECK-NEXT: s_addc_u32 s9, s9, use_module@gotpcrel32@hi+12 +; CHECK-NEXT: s_load_dwordx2 s[10:11], s[8:9], 0x0 +; CHECK-NEXT: 
s_load_dword s12, s[6:7], 0x0 +; CHECK-NEXT: s_mov_b64 s[6:7], s[4:5] +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: s_swappc_b64 s[30:31], s[10:11] +; CHECK-NEXT: s_lshl_b32 s4, s12, 2 +; CHECK-NEXT: v_mov_b32_e32 v0, 0 +; CHECK-NEXT: v_mov_b32_e32 v1, 1 +; CHECK-NEXT: s_add_i32 s4, s4, 8 +; CHECK-NEXT: v_mov_b32_e32 v2, 2 +; CHECK-NEXT: v_mov_b32_e32 v3, s4 +; CHECK-NEXT: ds_write_b16 v0, v1 +; CHECK-NEXT: ds_write_b16 v0, v2 offset:2 +; CHECK-NEXT: ds_write_b32 v3, v0 +; CHECK-NEXT: s_endpgm + call void @use_module() + store i16 1, i16 addrspace(3)* @module_variable + + store i16 2, i16 addrspace(3)* @kernel_normal + + %arrayidx1 = getelementptr inbounds [0 x float], [0 x float] addrspace(3)* @extern_overalign, i32 0, i32 %idx + store float 0.0, float addrspace(3)* %arrayidx1 + ret void +} + +define amdgpu_kernel void @module_0_kernel_overalign_extern_overalign(i32 %idx) #1 { +; CHECK-LABEL: module_0_kernel_overalign_extern_overalign: +; CHECK: ; %bb.0: +; CHECK-NEXT: s_load_dword s0, s[4:5], 0x0 +; CHECK-NEXT: v_mov_b32_e32 v0, 0 +; CHECK-NEXT: v_mov_b32_e32 v1, 2 +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: s_lshl_b32 s0, s0, 2 +; CHECK-NEXT: s_add_i32 s0, s0, 8 +; CHECK-NEXT: v_mov_b32_e32 v2, s0 +; CHECK-NEXT: ds_write_b16 v0, v1 +; CHECK-NEXT: ds_write_b32 v2, v0 +; CHECK-NEXT: s_endpgm + store i16 2, i16 addrspace(3)* @kernel_overalign + + %arrayidx1 = getelementptr inbounds [0 x float], [0 x float] addrspace(3)* @extern_overalign, i32 0, i32 %idx + store float 0.0, float addrspace(3)* %arrayidx1 + ret void +} + +define amdgpu_kernel void @module_1_kernel_overalign_extern_overalign(i32 %idx) { +; CHECK-LABEL: module_1_kernel_overalign_extern_overalign: +; CHECK: ; %bb.0: +; CHECK-NEXT: s_add_u32 s8, s8, s11 +; CHECK-NEXT: s_mov_b32 s32, 0 +; CHECK-NEXT: s_addc_u32 s9, s9, 0 +; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s8 +; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s9 +; CHECK-NEXT: s_add_u32 s0, s0, s11 +; CHECK-NEXT: s_addc_u32 s1, s1, 0 +; CHECK-NEXT: s_getpc_b64 s[8:9] +; CHECK-NEXT: s_add_u32 s8, s8, use_module@gotpcrel32@lo+4 +; CHECK-NEXT: s_addc_u32 s9, s9, use_module@gotpcrel32@hi+12 +; CHECK-NEXT: s_load_dwordx2 s[10:11], s[8:9], 0x0 +; CHECK-NEXT: s_load_dword s12, s[6:7], 0x0 +; CHECK-NEXT: s_mov_b64 s[6:7], s[4:5] +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: s_swappc_b64 s[30:31], s[10:11] +; CHECK-NEXT: s_lshl_b32 s4, s12, 2 +; CHECK-NEXT: v_mov_b32_e32 v0, 0 +; CHECK-NEXT: v_mov_b32_e32 v1, 1 +; CHECK-NEXT: s_add_i32 s4, s4, 8 +; CHECK-NEXT: v_mov_b32_e32 v2, 2 +; CHECK-NEXT: v_mov_b32_e32 v3, s4 +; CHECK-NEXT: ds_write_b16 v0, v1 +; CHECK-NEXT: ds_write_b16 v0, v2 offset:4 +; CHECK-NEXT: ds_write_b32 v3, v0 +; CHECK-NEXT: s_endpgm + call void @use_module() + store i16 1, i16 addrspace(3)* @module_variable + + store i16 2, i16 addrspace(3)* @kernel_overalign + + %arrayidx1 = getelementptr inbounds [0 x float], [0 x float] addrspace(3)* @extern_overalign, i32 0, i32 %idx + store float 0.0, float addrspace(3)* %arrayidx1 + ret void +} + +attributes #0 = { noinline } +attributes #1 = { "amdgpu-elide-module-lds" }
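
The offsets the CHECK lines above encode follow directly from the align-then-allocate arithmetic in allocateLDSGlobal. The standalone C++ sketch below is not the LLVM implementation; it only mirrors that arithmetic, and the sizes and alignments are assumptions chosen to match the module_1_kernel_overalign_extern_overalign case (one i16 module variable, a kernel struct holding one align-4 i16, align-8 dynamic LDS).

#include <cstdint>
#include <cstdio>

// Round Offset up to the next multiple of Alignment (a power of two).
static uint32_t alignUp(uint32_t Offset, uint32_t Alignment) {
  return (Offset + Alignment - 1) & ~(Alignment - 1);
}

int main() {
  uint32_t StaticLDSSize = 0;

  // 1. llvm.amdgcn.module.lds is allocated first, so its offset is always 0.
  //    Assumed contents: one i16 (@module_variable), size 2, align 2.
  uint32_t ModuleOffset = alignUp(StaticLDSSize, 2); // == 0
  StaticLDSSize = ModuleOffset + 2;

  // 2. llvm.amdgcn.kernel.<name>.lds follows immediately, before any other
  //    LDS, so its offset is also statically known. Assumed contents: one
  //    i16 with align 4 (@kernel_overalign), so the struct is align 4.
  uint32_t KernelOffset = alignUp(StaticLDSSize, 4); // == 4
  StaticLDSSize = KernelOffset + 2;

  // 3. Dynamic (extern) LDS, if any, starts at the static region rounded up
  //    to DynLDSAlign; this is the Trailing padding applied last. Assumed
  //    align 8 from @extern_overalign.
  uint32_t DynamicLDSStart = alignUp(StaticLDSSize, 8); // == 8

  printf("module @ %u, kernel @ %u, dynamic @ %u\n", ModuleOffset,
         KernelOffset, DynamicLDSStart);
  return 0;
}

This prints module @ 0, kernel @ 4, dynamic @ 8, matching the ds_write_b16 v0, v2 offset:4 and s_add_i32 s4, s4, 8 checks above. Passing Align() as Trailing when the two structs are allocated in allocateKnownAddressLDSGlobal means the DynLDSAlign rounding is applied only once, after every known-address variable has been placed, which is why no spurious padding appears between the module and kernel structs.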