diff --git a/llvm/include/llvm/Support/Alignment.h b/llvm/include/llvm/Support/Alignment.h
--- a/llvm/include/llvm/Support/Alignment.h
+++ b/llvm/include/llvm/Support/Alignment.h
@@ -345,6 +345,8 @@
   return Rhs && *Rhs > Lhs ? *Rhs : Lhs;
 }
 
+inline Align max(Align Lhs, Align Rhs) { return Lhs > Rhs ? Lhs : Rhs; }
+
 #ifndef NDEBUG
 // For usage in LLVM_DEBUG macros.
 inline std::string DebugStr(const Align &A) {
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.cpp b/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.cpp
--- a/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.cpp
@@ -9,11 +9,115 @@
 #include "AMDGPUMachineFunction.h"
 #include "AMDGPUPerfHintAnalysis.h"
 #include "AMDGPUSubtarget.h"
+#include "Utils/AMDGPULDSUtils.h"
 #include "llvm/CodeGen/MachineModuleInfo.h"
 #include "llvm/Target/TargetMachine.h"
 
 using namespace llvm;
 
+// Fix the natural alignment of the LDS global `GV` based on its size.
+static Align fixAlignment(const DataLayout &DL, const GlobalVariable *GV,
+                          Align Alignment) {
+  TypeSize GVSize = DL.getTypeAllocSize(GV->getValueType());
+
+  if (GVSize > 8) {
+    // We might want to use a b96 or b128 load/store
+    Alignment = max(Alignment, Align(16));
+  } else if (GVSize > 4) {
+    // We might want to use a b64 load/store
+    Alignment = max(Alignment, Align(8));
+  } else if (GVSize > 2) {
+    // We might want to use a b32 load/store
+    Alignment = max(Alignment, Align(4));
+  } else if (GVSize > 1) {
+    // We might want to use a b16 load/store
+    Alignment = max(Alignment, Align(2));
+  }
+
+  return Alignment;
+}
+
+// Collect all kernel functions within which `GV` is used.
+static std::set<const Function *>
+collectUsedKernels(const GlobalVariable *GV) {
+  std::set<const Function *> UsedKernels;
+  SmallVector<const User *> UserStack(GV->users());
+  SmallPtrSet<const User *, 8> VisitedUsers;
+
+  while (!UserStack.empty()) {
+    const auto *UU = UserStack.pop_back_val();
+
+    if (!VisitedUsers.insert(UU).second)
+      continue;
+
+    if (isa<GlobalValue>(UU))
+      continue;
+
+    if (isa<Constant>(UU)) {
+      append_range(UserStack, UU->users());
+      continue;
+    }
+
+    const auto *I = dyn_cast<Instruction>(UU);
+    assert(I && "Expected an instruction");
+    const Function *F = I->getFunction();
+    if (AMDGPU::isModuleEntryFunctionCC(F->getCallingConv()))
+      UsedKernels.insert(F);
+  }
+
+  return UsedKernels;
+}
+
+static std::vector<const GlobalVariable *>
+getSortedUsedLDSGlobals(const Function *K) {
+  const Module *M = K->getParent();
+  const DataLayout &DL = M->getDataLayout();
+
+  // Collect all static LDS globals defined within the module.
+  std::vector<const GlobalVariable *> LDSGlobals =
+      AMDGPU::collectStaticLDSGlobals(M);
+
+  // Collect those static LDS globals which are *used* within kernel K.
+  std::vector<const GlobalVariable *> UsedLDSGlobals;
+  for (const auto *GV : LDSGlobals) {
+    if (llvm::is_contained(collectUsedKernels(GV), K))
+      UsedLDSGlobals.push_back(GV);
+  }
+
+  // Sort the LDS globals used within kernel K by alignment (descending); on
+  // ties, by size (descending); on further ties, by name (lexicographically).
+  llvm::stable_sort(
+      UsedLDSGlobals,
+      [&](const GlobalVariable *LHS, const GlobalVariable *RHS) -> bool {
+        Align ALHS = fixAlignment(DL, LHS, AMDGPU::getAlign(DL, LHS));
+        Align ARHS = fixAlignment(DL, RHS, AMDGPU::getAlign(DL, RHS));
+        if (ALHS != ARHS) {
+          return ALHS > ARHS;
+        }
+
+        TypeSize SLHS = DL.getTypeAllocSize(LHS->getValueType());
+        TypeSize SRHS = DL.getTypeAllocSize(RHS->getValueType());
+        if (SLHS != SRHS) {
+          return SLHS > SRHS;
+        }
+
+        return LHS->getName() < RHS->getName();
+      });
+
+  // Module LDS, possibly created by the "Lower Module LDS" pass, should be
+  // allocated at address 0, irrespective of its size and alignment.
+  std::vector<const GlobalVariable *> SortedLDSGlobals;
+  const GlobalVariable *ModuleLDS =
+      M->getGlobalVariable("llvm.amdgcn.module.lds");
+  if (ModuleLDS)
+    SortedLDSGlobals.push_back(ModuleLDS);
+
+  for (const auto *GV : UsedLDSGlobals) {
+    if (GV != ModuleLDS)
+      SortedLDSGlobals.push_back(GV);
+  }
+
+  return SortedLDSGlobals;
+}
+
 AMDGPUMachineFunction::AMDGPUMachineFunction(const MachineFunction &MF)
     : MachineFunctionInfo(), Mode(MF.getFunction()),
       IsEntryFunction(
@@ -36,6 +140,14 @@
   CallingConv::ID CC = F.getCallingConv();
   if (CC == CallingConv::AMDGPU_KERNEL || CC == CallingConv::SPIR_KERNEL)
     ExplicitKernArgSize = ST.getExplicitKernArgSize(F, MaxKernArgAlign);
+
+  if (IsModuleEntryFunction) {
+    std::vector<const GlobalVariable *> SortedLDSGlobals =
+        getSortedUsedLDSGlobals(&F);
+
+    for (const auto *GV : SortedLDSGlobals)
+      allocateLDSGlobal(F.getParent()->getDataLayout(), *GV);
+  }
 }
 
 unsigned AMDGPUMachineFunction::allocateLDSGlobal(const DataLayout &DL,
@@ -47,6 +159,9 @@
   Align Alignment =
       DL.getValueOrABITypeAlignment(GV.getAlign(), GV.getValueType());
 
+  // Make sure that the natural alignment of `GV` is correctly fixed.
+  Alignment = fixAlignment(DL, &GV, Alignment);
+
   /// TODO: We should sort these to minimize wasted space due to alignment
   /// padding. Currently the padding is decided by the first encountered use
   /// during lowering.
diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPULDSUtils.h b/llvm/lib/Target/AMDGPU/Utils/AMDGPULDSUtils.h
--- a/llvm/lib/Target/AMDGPU/Utils/AMDGPULDSUtils.h
+++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPULDSUtils.h
@@ -31,6 +31,8 @@
 
 SmallPtrSet<GlobalValue *, 32> getUsedList(Module &M);
 
+std::vector<const GlobalVariable *> collectStaticLDSGlobals(const Module *M);
+
 } // end namespace AMDGPU
 
 } // end namespace llvm
diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPULDSUtils.cpp b/llvm/lib/Target/AMDGPU/Utils/AMDGPULDSUtils.cpp
--- a/llvm/lib/Target/AMDGPU/Utils/AMDGPULDSUtils.cpp
+++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPULDSUtils.cpp
@@ -122,6 +122,51 @@
   return UsedList;
 }
 
+std::vector<const GlobalVariable *> collectStaticLDSGlobals(const Module *M) {
+  std::vector<const GlobalVariable *> StaticLDSGlobals;
+
+  for (const auto &GV : M->globals()) {
+    if (GV.getAddressSpace() != AMDGPUAS::LOCAL_ADDRESS) {
+      // Skip non-LDS globals.
+      continue;
+    }
+
+    if (GV.hasExternalLinkage()) {
+      // HIP uses an unsized array `extern __shared__ T s[]`, or a similar
+      // zero-sized type in other languages, to declare a dynamic shared
+      // memory variable whose size is not known at compile time. Such
+      // variables are allocated by the runtime and placed directly after the
+      // statically allocated ones, so skip them here.
+      assert(GV.getValueType()->isEmptyTy() &&
+             "Expected dynamic shared memory variable type to be empty");
+      continue;
+    }
+
+    if (!GV.hasInitializer()) {
+      // Static LDS globals should have "UndefValue" as an initializer. Skip
+      // LDS globals which do not have an initializer. The ISel pass will
+      // catch such static LDS globals and report an error accordingly.
+      continue;
+    }
+
+    if (!isa<UndefValue>(GV.getInitializer())) {
+      // Static LDS globals should have "UndefValue" as an initializer. Skip
+      // LDS globals which have an initializer that is not "UndefValue". The
+      // ISel pass will catch such static LDS globals and report an error
+      // accordingly.
+      continue;
+    }
+
+    if (GV.isConstant()) {
+      // A constant LDS global cannot be allocated. Skip it.
+      continue;
+    }
+
+    StaticLDSGlobals.push_back(&GV);
+  }
+
+  return StaticLDSGlobals;
+}
+
 } // end namespace AMDGPU
 
 } // end namespace llvm
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/lds-global-value.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/lds-global-value.ll
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/lds-global-value.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/lds-global-value.ll
@@ -11,7 +11,7 @@
 ; CHECK-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
 ; CHECK-NEXT:    v_mov_b32_e32 v0, 4
 ; CHECK-NEXT:    s_mov_b32 m0, -1
-; CHECK-NEXT:    ds_read_b32 v3, v0 offset:4
+; CHECK-NEXT:    ds_read_b32 v3, v0
 ; CHECK-NEXT:    v_mov_b32_e32 v2, 9
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-NEXT:    s_add_u32 s0, s0, 4
@@ -19,7 +19,7 @@
 ; CHECK-NEXT:    v_mov_b32_e32 v0, s0
 ; CHECK-NEXT:    v_mov_b32_e32 v1, s1
 ; CHECK-NEXT:    flat_store_dword v[0:1], v3
-; CHECK-NEXT:    v_mov_b32_e32 v0, 0
+; CHECK-NEXT:    v_mov_b32_e32 v0, 0x200
 ; CHECK-NEXT:    ds_write_b32 v0, v2
 ; CHECK-NEXT:    s_endpgm
 entry:
diff --git a/llvm/test/CodeGen/AMDGPU/lds-alignment.ll b/llvm/test/CodeGen/AMDGPU/lds-alignment.ll
--- a/llvm/test/CodeGen/AMDGPU/lds-alignment.ll
+++ b/llvm/test/CodeGen/AMDGPU/lds-alignment.ll
@@ -96,7 +96,7 @@
 
 ; (7 * 8) + (39 * 4) = 212
 ; HSA-LABEL: {{^}}test_missing_alignment_size_2_order0:
-; HSA: workgroup_group_segment_byte_size = 212
+; HSA: workgroup_group_segment_byte_size = 216
 ; HSA: group_segment_alignment = 4
 define amdgpu_kernel void @test_missing_alignment_size_2_order0(i8 addrspace(1)* %out, i8 addrspace(1)* %in) #1 {
   %lds.missing.align.0.bc = bitcast [39 x i32] addrspace(3)* @lds.missing.align.0 to i8 addrspace(3)*
@@ -182,7 +182,7 @@
 ; align 16, 32, 8
 ; 38 + (26 pad) + 38 + (10 pad) + 38 = 150
 ; HSA-LABEL: {{^}}test_round_size_3_order2:
-; HSA: workgroup_group_segment_byte_size = 150
+; HSA: workgroup_group_segment_byte_size = 134
 ; HSA: group_segment_alignment = 4
 define amdgpu_kernel void @test_round_size_3_order2(i8 addrspace(1)* %out, i8 addrspace(1)* %in) #1 {
   %lds.align16.0.bc = bitcast [38 x i8] addrspace(3)* @lds.align16.0 to i8 addrspace(3)*
@@ -203,7 +203,7 @@
 ; align 16, 8, 32
 ; 38 + (2 pad) + 38 + (2 pad) + 38
 ; HSA-LABEL: {{^}}test_round_size_3_order3:
-; HSA: workgroup_group_segment_byte_size = 118
+; HSA: workgroup_group_segment_byte_size = 134
 ; HSA: group_segment_alignment = 4
 define amdgpu_kernel void @test_round_size_3_order3(i8 addrspace(1)* %out, i8 addrspace(1)* %in) #1 {
   %lds.align16.0.bc = bitcast [38 x i8] addrspace(3)* @lds.align16.0 to i8 addrspace(3)*
@@ -224,7 +224,7 @@
 ; align 8, 32, 16
 ; 38 + (26 pad) + 38 + (2 pad) + 38 = 142
 ; HSA-LABEL: {{^}}test_round_size_3_order4:
-; HSA: workgroup_group_segment_byte_size = 142
+; HSA: workgroup_group_segment_byte_size = 134
 ; HSA: group_segment_alignment = 4
 define amdgpu_kernel void @test_round_size_3_order4(i8 addrspace(1)* %out, i8 addrspace(1)* %in) #1 {
   %lds.align8.0.bc = bitcast [38 x i8] addrspace(3)* @lds.align8.0 to i8 addrspace(3)*
@@ -245,7 +245,7 @@
 ; align 8, 16, 32
 ; 38 + (10 pad) + 38 + (2 pad) + 38 = 126
 ; HSA-LABEL: {{^}}test_round_size_3_order5:
-; HSA: workgroup_group_segment_byte_size = 126
+; HSA: workgroup_group_segment_byte_size = 134
 ; HSA: group_segment_alignment = 4
 define amdgpu_kernel void @test_round_size_3_order5(i8 addrspace(1)* %out, i8 addrspace(1)* %in) #1 {
   %lds.align8.0.bc = bitcast [38 x i8] addrspace(3)* @lds.align8.0 to i8 addrspace(3)*
diff --git a/llvm/test/CodeGen/AMDGPU/local-memory.amdgcn.ll b/llvm/test/CodeGen/AMDGPU/local-memory.amdgcn.ll
--- a/llvm/test/CodeGen/AMDGPU/local-memory.amdgcn.ll
+++ b/llvm/test/CodeGen/AMDGPU/local-memory.amdgcn.ll
@@ -41,11 +41,11 @@
 ; CI-DAG: v_sub_i32_e32 [[SUB:v[0-9]+]], vcc, 0, [[ADDRW]]
 ; CI-DAG: ds_write2_b32 [[ADDRW]], {{v[0-9]+}}, {{v[0-9]+}} offset1:4
 ; SI-DAG: ds_write2_b32 [[ADDRW]], {{v[0-9]+}}, {{v[0-9]+}} offset1:4
-; SI-DAG: v_sub_i32_e32 [[SUB0:v[0-9]+]], vcc, 28, [[ADDRW]]
+; SI-DAG: v_sub_i32_e32 [[SUB0:v[0-9]+]], vcc, 12, [[ADDRW]]
 
 ; GCN: s_barrier
 
-; SI-DAG: v_sub_i32_e32 [[SUB1:v[0-9]+]], vcc, 12, [[ADDRW]]
+; SI-DAG: v_sub_i32_e32 [[SUB1:v[0-9]+]], vcc, 28, [[ADDRW]]
 ; SI-DAG: ds_read_b32 v{{[0-9]+}}, [[SUB0]]
 ; SI-DAG: ds_read_b32 v{{[0-9]+}}, [[SUB1]]
 ; CI: ds_read2_b32 {{v\[[0-9]+:[0-9]+\]}}, [[SUB]] offset0:3 offset1:7
diff --git a/llvm/test/CodeGen/AMDGPU/promote-alloca-padding-size-estimate.ll b/llvm/test/CodeGen/AMDGPU/promote-alloca-padding-size-estimate.ll
--- a/llvm/test/CodeGen/AMDGPU/promote-alloca-padding-size-estimate.ll
+++ b/llvm/test/CodeGen/AMDGPU/promote-alloca-padding-size-estimate.ll
@@ -31,7 +31,7 @@
 
 
 ; GCN-LABEL: {{^}}promote_alloca_size_order_0:
-; GCN: workgroup_group_segment_byte_size = 1060
+; GCN: workgroup_group_segment_byte_size = 1072
 define amdgpu_kernel void @promote_alloca_size_order_0(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in, i32 %idx) #0 {
 entry:
   %stack = alloca [5 x i32], align 4, addrspace(5)
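
The effect of the new allocation order can be reproduced with a small standalone C++ sketch (toy types and hand-copied sizes, not the LLVM data structures). It mirrors the comparator used in getSortedUsedLDSGlobals and an align-then-bump allocation loop that is assumed to approximate what allocateLDSGlobal does, applied to the three [38 x i8] globals from lds-alignment.ll with their fixed alignments:

// Standalone sketch (toy types, not the LLVM classes) of the ordering and
// allocation scheme introduced by this patch.
#include <algorithm>
#include <cstdint>
#include <iostream>
#include <string>
#include <vector>

struct LDSVar {
  std::string Name;
  uint64_t Size;      // allocation size in bytes
  uint64_t Alignment; // power-of-two alignment in bytes, post-fixAlignment
};

// Round Offset up to the next multiple of a power-of-two Alignment.
static uint64_t alignTo(uint64_t Offset, uint64_t Alignment) {
  return (Offset + Alignment - 1) & ~(Alignment - 1);
}

int main() {
  // The three [38 x i8] globals from lds-alignment.ll; @lds.align8.0 is bumped
  // from align 8 to align 16 by fixAlignment because its size exceeds 8 bytes.
  std::vector<LDSVar> Vars = {
      {"lds.align8.0", 38, 16},
      {"lds.align32.0", 38, 32},
      {"lds.align16.0", 38, 16},
  };

  // Same comparator shape as the stable_sort in getSortedUsedLDSGlobals:
  // alignment descending, then size descending, then name ascending.
  std::stable_sort(Vars.begin(), Vars.end(),
                   [](const LDSVar &LHS, const LDSVar &RHS) {
                     if (LHS.Alignment != RHS.Alignment)
                       return LHS.Alignment > RHS.Alignment;
                     if (LHS.Size != RHS.Size)
                       return LHS.Size > RHS.Size;
                     return LHS.Name < RHS.Name;
                   });

  // Allocate in sorted order; the running offset at the end matches the
  // workgroup_group_segment_byte_size values checked in the tests above.
  uint64_t Offset = 0;
  for (const LDSVar &V : Vars) {
    Offset = alignTo(Offset, V.Alignment);
    std::cout << V.Name << " -> offset " << Offset << "\n";
    Offset += V.Size;
  }
  std::cout << "segment size = " << Offset << " bytes\n"; // prints 134
  return 0;
}

Because the comparator is total and deterministic (name is the final tie-break), the layout no longer depends on the order in which uses are encountered during lowering, which is why the test_round_size_3_order* expectations above all collapse to the same 134 bytes.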