diff --git a/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.cpp b/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.cpp
--- a/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.cpp
@@ -9,6 +9,7 @@
 #include "AMDGPUMachineFunction.h"
 #include "AMDGPUPerfHintAnalysis.h"
 #include "AMDGPUSubtarget.h"
+#include "Utils/AMDGPULDSUtils.h"
 #include "llvm/CodeGen/MachineModuleInfo.h"
 #include "llvm/Target/TargetMachine.h"
 
@@ -36,6 +37,15 @@
   CallingConv::ID CC = F.getCallingConv();
   if (CC == CallingConv::AMDGPU_KERNEL || CC == CallingConv::SPIR_KERNEL)
     ExplicitKernArgSize = ST.getExplicitKernArgSize(F, MaxKernArgAlign);
+
+  if (IsModuleEntryFunction) {
+    SmallVector<const GlobalVariable *, 8> SortedLDSGlobals;
+    AMDGPU::getSortedUsedLDSGlobals(&F, SortedLDSGlobals);
+
+    const DataLayout &DL = F.getParent()->getDataLayout();
+    for (const auto *GV : SortedLDSGlobals)
+      allocateLDSGlobal(DL, *GV);
+  }
 }
 
 unsigned AMDGPUMachineFunction::allocateLDSGlobal(const DataLayout &DL,
@@ -47,9 +57,9 @@
   Align Alignment =
       DL.getValueOrABITypeAlignment(GV.getAlign(), GV.getValueType());
 
-  /// TODO: We should sort these to minimize wasted space due to alignment
-  /// padding. Currently the padding is decided by the first encountered use
-  /// during lowering.
+  // Make sure that the natural alignment of `GV` is correctly fixed.
+  Alignment = AMDGPU::fixAlignment(DL, &GV, Alignment);
+
   unsigned Offset = StaticLDSSize = alignTo(StaticLDSSize, Alignment);
 
   Entry.first->second = Offset;
diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPULDSUtils.h b/llvm/lib/Target/AMDGPU/Utils/AMDGPULDSUtils.h
--- a/llvm/lib/Target/AMDGPU/Utils/AMDGPULDSUtils.h
+++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPULDSUtils.h
@@ -23,6 +23,24 @@
 
 Align getAlign(DataLayout const &DL, const GlobalVariable *GV);
 
+// Fix the natural alignment of LDS global `GV` based on its size.
+Align fixAlignment(const DataLayout &DL, const GlobalVariable *GV,
+                   Align Alignment);
+
+// Check if the kernel K uses the LDS global GV.
+bool hasKernelUsesLDS(const GlobalVariable *GV, const Function *K);
+
+// Sort LDS globals (which are used within kernel K) by alignment (descending);
+// on ties, by size (descending); on ties, by name (lexicographically).
+void sortLDSGlobals(SmallVectorImpl<const GlobalVariable *> &LDSGlobals,
+                    const DataLayout &DL);
+
+// Collect all static LDS globals which are used within kernel K, in sorted
+// order.
+void getSortedUsedLDSGlobals(
+    const Function *K,
+    SmallVectorImpl<const GlobalVariable *> &SortedLDSGlobals);
+
 bool userRequiresLowering(const SmallPtrSetImpl<GlobalValue *> &UsedList,
                           User *InitialUser);
 
@@ -31,6 +49,9 @@
 
 SmallPtrSet<GlobalValue *, 32> getUsedList(Module &M);
 
+void collectStaticLDSGlobals(
+    const Module *M, SmallVectorImpl<const GlobalVariable *> &StaticLDSGlobals);
+
 } // end namespace AMDGPU
 
 } // end namespace llvm
diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPULDSUtils.cpp b/llvm/lib/Target/AMDGPU/Utils/AMDGPULDSUtils.cpp
--- a/llvm/lib/Target/AMDGPU/Utils/AMDGPULDSUtils.cpp
+++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPULDSUtils.cpp
@@ -29,6 +29,163 @@
                                        GV->getValueType());
 }
 
+// Fix the natural alignment of LDS global `GV` based on its size.
+Align fixAlignment(const DataLayout &DL, const GlobalVariable *GV,
+                   Align Alignment) {
+  TypeSize GVSize = DL.getTypeAllocSize(GV->getValueType());
+
+  if (GVSize > 8) {
+    // We might want to use a b96 or b128 load/store
+    Alignment = std::max(Alignment, Align(16));
+  } else if (GVSize > 4) {
+    // We might want to use a b64 load/store
+    Alignment = std::max(Alignment, Align(8));
+  } else if (GVSize > 2) {
+    // We might want to use a b32 load/store
+    Alignment = std::max(Alignment, Align(4));
+  } else if (GVSize > 1) {
+    // We might want to use a b16 load/store
+    Alignment = std::max(Alignment, Align(2));
+  }
+
+  return Alignment;
+}
+
+// Check if the kernel K uses the LDS global GV.
+bool hasKernelUsesLDS(const GlobalVariable *GV, const Function *K) {
+  SmallVector<const User *, 8> UserStack(GV->users());
+  SmallPtrSet<const User *, 8> VisitedUsers;
+
+  // Following are the possibilities of GV uses:
+  // 1. GV is directly used within an instruction as one of its operands.
+  // 2. GV is used within a constant expression, and this constant expression
+  //    itself is used either within an instruction or as an initializer of
+  //    a global variable. Though the latter case is rare, we nevertheless
+  //    need to check for it.
+  while (!UserStack.empty()) {
+    const auto *UU = UserStack.pop_back_val();
+
+    // No need to handle already visited users.
+    if (!VisitedUsers.insert(UU).second)
+      continue;
+
+    // Ignore possible, but rare, uses as in the below example.
+    //   @GV = addrspace(3) global i32 undef
+    //   @GV2 =
+    //     addrspace(1) global i32* addrspacecast(i32 addrspace(3)* @GV to i32*)
+    if (isa<GlobalValue>(UU))
+      continue;
+
+    // Recursively traverse through constant expressions.
+    if (isa<ConstantExpr>(UU)) {
+      append_range(UserStack, UU->users());
+      continue;
+    }
+
+    // At this point, the user should be an instruction. Check if this
+    // instruction is from the kernel K; if so, return true.
+    if (cast<Instruction>(UU)->getFunction() == K)
+      return true;
+  }
+
+  // GV is not used within kernel K; return false.
+  return false;
+}
+
+// Sort LDS globals (which are used within kernel K) by alignment (descending);
+// on ties, by size (descending); on ties, by name (lexicographically).
+void sortLDSGlobals(SmallVectorImpl<const GlobalVariable *> &LDSGlobals,
+                    const DataLayout &DL) {
+  llvm::stable_sort(
+      LDSGlobals,
+      [&](const GlobalVariable *LHS, const GlobalVariable *RHS) -> bool {
+        Align ALHS = AMDGPU::fixAlignment(DL, LHS, AMDGPU::getAlign(DL, LHS));
+        Align ARHS = AMDGPU::fixAlignment(DL, RHS, AMDGPU::getAlign(DL, RHS));
+        if (ALHS != ARHS) {
+          return ALHS > ARHS;
+        }
+
+        TypeSize SLHS = DL.getTypeAllocSize(LHS->getValueType());
+        TypeSize SRHS = DL.getTypeAllocSize(RHS->getValueType());
+        if (SLHS != SRHS) {
+          return SLHS > SRHS;
+        }
+
+        return LHS->getName() < RHS->getName();
+      });
+}
+
+void collectStaticLDSGlobals(
+    const Module *M,
+    SmallVectorImpl<const GlobalVariable *> &StaticLDSGlobals) {
+
+  for (const auto &GV : M->globals()) {
+    if (GV.getAddressSpace() != AMDGPUAS::LOCAL_ADDRESS) {
+      continue;
+    }
+
+    if (!GV.hasInitializer()) {
+      // addrspace(3) without initializer implies cuda/hip extern __shared__
+      // the semantics for such a variable appears to be that all extern
+      // __shared__ variables alias one another, in which case this transform
+      // is not required
+      continue;
+    }
+
+    if (!isa<UndefValue>(GV.getInitializer())) {
+      // Initializers are unimplemented for local address space.
+      // Leave such variables in place for consistent error reporting.
+      continue;
+    }
+
+    if (GV.isConstant()) {
+      // A constant undef variable can't be written to, and any load is
+      // undef, so it should be eliminated by the optimizer. It could be
+      // dropped by the back end if not. This pass skips over it.
+      continue;
+    }
+
+    StaticLDSGlobals.push_back(&GV);
+  }
+}
+
+// Collect all static LDS globals which are used within kernel K, in sorted
+// order.
+void getSortedUsedLDSGlobals(
+    const Function *K,
+    SmallVectorImpl<const GlobalVariable *> &SortedLDSGlobals) {
+  const Module *M = K->getParent();
+  const DataLayout &DL = M->getDataLayout();
+
+  // Collect all static LDS globals defined within the module.
+  SmallVector<const GlobalVariable *, 8> StaticLDSGlobals;
+  AMDGPU::collectStaticLDSGlobals(M, StaticLDSGlobals);
+
+  // Remove all those static LDS globals which are *not used* within kernel K.
+  StaticLDSGlobals.erase(
+      std::remove_if(StaticLDSGlobals.begin(), StaticLDSGlobals.end(),
+                     [&](const GlobalVariable *GV) {
+                       return !AMDGPU::hasKernelUsesLDS(GV, K);
+                     }),
+      StaticLDSGlobals.end());
+
+  // Sort LDS globals (which are used within kernel K) by alignment
+  // (descending); on ties, by size (descending); on ties, by name
+  // (lexicographically).
+  AMDGPU::sortLDSGlobals(StaticLDSGlobals, DL);
+
+  // Module LDS, which is possibly created by the "Lower Module LDS" pass,
+  // should be allocated at address 0, irrespective of its size and alignment.
+  GlobalVariable *ModuleLDS =
+      M->getGlobalVariable("llvm.amdgcn.module.lds", true);
+  if (ModuleLDS)
+    SortedLDSGlobals.push_back(ModuleLDS);
+
+  for (const auto *GV : StaticLDSGlobals) {
+    if (GV != ModuleLDS)
+      SortedLDSGlobals.push_back(GV);
+  }
+}
+
 bool userRequiresLowering(const SmallPtrSetImpl<GlobalValue *> &UsedList,
                           User *InitialUser) {
   // Any LDS variable can be lowered by moving into the created struct
@@ -75,36 +232,23 @@
 std::vector<GlobalVariable *>
 findVariablesToLower(Module &M,
                      const SmallPtrSetImpl<GlobalValue *> &UsedList) {
+  SmallVector<const GlobalVariable *, 8> StaticLDSGlobals;
+  collectStaticLDSGlobals(&M, StaticLDSGlobals);
+
   std::vector<GlobalVariable *> LocalVars;
-  for (auto &GV : M.globals()) {
-    if (GV.getType()->getPointerAddressSpace() != AMDGPUAS::LOCAL_ADDRESS) {
-      continue;
-    }
-    if (!GV.hasInitializer()) {
-      // addrspace(3) without initializer implies cuda/hip extern __shared__
-      // the semantics for such a variable appears to be that all extern
-      // __shared__ variables alias one another, in which case this transform
-      // is not required
-      continue;
-    }
-    if (!isa<UndefValue>(GV.getInitializer())) {
-      // Initializers are unimplemented for local address space.
-      // Leave such variables in place for consistent error reporting.
-      continue;
-    }
-    if (GV.isConstant()) {
-      // A constant undef variable can't be written to, and any load is
-      // undef, so it should be eliminated by the optimizer. It could be
-      // dropped by the back end if not. This pass skips over it.
-      continue;
-    }
-    if (std::none_of(GV.user_begin(), GV.user_end(), [&](User *U) {
+
+  for (const auto *GV : StaticLDSGlobals) {
+    GlobalVariable *GV2 = const_cast<GlobalVariable *>(GV);
+
+    if (std::none_of(GV2->user_begin(), GV2->user_end(), [&](User *U) {
          return userRequiresLowering(UsedList, U);
        })) {
      continue;
    }
-    LocalVars.push_back(&GV);
+
+    LocalVars.push_back(GV2);
  }
+
  return LocalVars;
 }
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/lds-allocation.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/lds-allocation.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/lds-allocation.ll
@@ -0,0 +1,481 @@
+; RUN: llc -global-isel -march=amdgcn -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx900 --amdhsa-code-object-version=4 < %s | FileCheck -check-prefixes=HSA,LDS-USED %s
+
+; LDS allocation is now done in the sorted order as below:
+;
+; 1.
First, alignment is fixed, even for missing alignment or underaligned cases since +; we know the size of LDS globals at compile time. +; 2. Next, per kernel used LDS globals are sorted, based on natural alignment, on ties, +; based on size. +; 3. Finally, memory is allocated in the above sorted order. + + +; [TEST 0] All LDS are properly aligned +; +; No alignment fix is required for allocation as all the LDS globals are on required alignment +; boundary. That is: +; @lds.size.16.align.16 -> 16 byte boundary +; @lds.size.8.align.8 -> 8 byte boundary +; @lds.size.4.align.4 -> 4 byte boundary +; @lds.size.2.align.2 -> 2 byte boundary +; @lds.size.1.align.1 -> 1 byte boundary (any boundary) +; +; Sorted order based on alignment should be: +; [@lds.size.16.align.16, @lds.size.8.align.8, @lds.size.4.align.4, @lds.size.2.align.2, @lds.size.1.align.1] +; +; Memory allocated in the sorted order should be: +; 16 + 8 + 4 + 2 + 1 [= 31 bytes] + +@lds.size.1.align.1 = internal unnamed_addr addrspace(3) global [1 x i8] undef, align 1 +@lds.size.2.align.2 = internal unnamed_addr addrspace(3) global [2 x i8] undef, align 2 +@lds.size.4.align.4 = internal unnamed_addr addrspace(3) global [4 x i8] undef, align 4 +@lds.size.8.align.8 = internal unnamed_addr addrspace(3) global [8 x i8] undef, align 8 +@lds.size.16.align.16 = internal unnamed_addr addrspace(3) global [16 x i8] undef, align 16 + +; HSA-LABEL: {{^}}fix_alignment_0: +; HSA: v_mov_b32_e32 v0, 1 +; HSA: v_mov_b32_e32 v1, 30 +; HSA: ds_write_b8 v1, v0 +; HSA: v_mov_b32_e32 v0, 2 +; HSA: v_mov_b32_e32 v1, 28 +; HSA: ds_write_b8 v1, v0 +; HSA: v_mov_b32_e32 v0, 3 +; HSA: v_mov_b32_e32 v1, 24 +; HSA: ds_write_b8 v1, v0 +; HSA: v_mov_b32_e32 v0, 4 +; HSA: v_mov_b32_e32 v1, 16 +; HSA: ds_write_b8 v1, v0 +; HSA: v_mov_b32_e32 v0, 5 +; HSA: v_mov_b32_e32 v1, 0 +; HSA: ds_write_b8 v1, v0 +; HSA: s_endpgm + +; LDS-USED: amdhsa_group_segment_fixed_size 31 +define amdgpu_kernel void @fix_alignment_0() { + %lds.size.1.align.1.bc = bitcast [1 x i8] addrspace(3)* @lds.size.1.align.1 to i8 addrspace(3)* + store i8 1, i8 addrspace(3)* %lds.size.1.align.1.bc, align 1 + + %lds.size.2.align.2.bc = bitcast [2 x i8] addrspace(3)* @lds.size.2.align.2 to i8 addrspace(3)* + store i8 2, i8 addrspace(3)* %lds.size.2.align.2.bc, align 2 + + %lds.size.4.align.4.bc = bitcast [4 x i8] addrspace(3)* @lds.size.4.align.4 to i8 addrspace(3)* + store i8 3, i8 addrspace(3)* %lds.size.4.align.4.bc, align 4 + + %lds.size.8.align.8.bc = bitcast [8 x i8] addrspace(3)* @lds.size.8.align.8 to i8 addrspace(3)* + store i8 4, i8 addrspace(3)* %lds.size.8.align.8.bc, align 8 + + %lds.size.16.align.16.bc = bitcast [16 x i8] addrspace(3)* @lds.size.16.align.16 to i8 addrspace(3)* + store i8 5, i8 addrspace(3)* %lds.size.16.align.16.bc, align 16 + + ret void +} + + +; [TEST 1] All LDS are underaligned, requires to allocate on different alignment boundary +; +; Fixed alignments for allocation should be: +; @lds.size.9.align.8 -> 16 byte boundary +; @lds.size.5.align.4 -> 8 byte boundary +; @lds.size.3.align.2 -> 4 byte boundary +; @lds.size.2.align.1 -> 2 byte boundary +; +; Sorted order based on *fixed* alignment should be: +; [@lds.size.9.align.8, lds.size.5.align.4, lds.size.3.align.2, lds.size.2.align.1] +; +; Memory allocated in the sorted order should be: +; 9 + (pad 7) + 5 + (pad 3) + 3 + (pad 1) + 2 [= 30 bytes] + +@lds.size.2.align.1 = internal unnamed_addr addrspace(3) global [2 x i8] undef, align 1 +@lds.size.3.align.2 = internal unnamed_addr addrspace(3) global [3 x i8] undef, align 2 
+@lds.size.5.align.4 = internal unnamed_addr addrspace(3) global [5 x i8] undef, align 4 +@lds.size.9.align.8 = internal unnamed_addr addrspace(3) global [9 x i8] undef, align 8 + +; HSA-LABEL: {{^}}fix_alignment_1: +; HSA: v_mov_b32_e32 v0, 1 +; HSA: v_mov_b32_e32 v1, 28 +; HSA: ds_write_b8 v1, v0 +; HSA: v_mov_b32_e32 v0, 2 +; HSA: v_mov_b32_e32 v1, 24 +; HSA: ds_write_b8 v1, v0 +; HSA: v_mov_b32_e32 v0, 3 +; HSA: v_mov_b32_e32 v1, 16 +; HSA: ds_write_b8 v1, v0 +; HSA: v_mov_b32_e32 v0, 4 +; HSA: v_mov_b32_e32 v1, 0 +; HSA: ds_write_b8 v1, v0 +; HSA: s_endpgm + +; LDS-USED: amdhsa_group_segment_fixed_size 30 +define amdgpu_kernel void @fix_alignment_1() { + %lds.size.2.align.1.bc = bitcast [2 x i8] addrspace(3)* @lds.size.2.align.1 to i8 addrspace(3)* + store i8 1, i8 addrspace(3)* %lds.size.2.align.1.bc, align 1 + + %lds.size.3.align.2.bc = bitcast [3 x i8] addrspace(3)* @lds.size.3.align.2 to i8 addrspace(3)* + store i8 2, i8 addrspace(3)* %lds.size.3.align.2.bc, align 2 + + %lds.size.5.align.4.bc = bitcast [5 x i8] addrspace(3)* @lds.size.5.align.4 to i8 addrspace(3)* + store i8 3, i8 addrspace(3)* %lds.size.5.align.4.bc, align 4 + + %lds.size.9.align.8.bc = bitcast [9 x i8] addrspace(3)* @lds.size.9.align.8 to i8 addrspace(3)* + store i8 4, i8 addrspace(3)* %lds.size.9.align.8.bc, align 8 + + ret void +} + + +; [TEST 2] All LDS are underaligned, requires to allocate on 16 byte boundary +; +; Fixed alignments for allocation should be: +; @lds.size.9.align.1 -> 16 byte boundary +; @lds.size.10.align.2 -> 16 byte boundary +; @lds.size.11.align.4 -> 16 byte boundary +; @lds.size.12.align.8 -> 16 byte boundary +; +; Sorted order based on *fixed* alignment and then by size should be: +; [@lds.size.12.align.8, @lds.size.11.align.4, @lds.size.10.align.2, @lds.size.9.align.1] +; +; Memory allocated in the sorted order should be: +; 12 + (pad 4) + 11 + (pad 5) + 10 + (pad 6) + 9 [= 57 bytes] + +@lds.size.9.align.1 = internal unnamed_addr addrspace(3) global [9 x i8] undef, align 1 +@lds.size.10.align.2 = internal unnamed_addr addrspace(3) global [10 x i8] undef, align 2 +@lds.size.11.align.4 = internal unnamed_addr addrspace(3) global [11 x i8] undef, align 4 +@lds.size.12.align.8 = internal unnamed_addr addrspace(3) global [12 x i8] undef, align 8 + +; HSA-LABEL: {{^}}fix_alignment_2: +; HSA: v_mov_b32_e32 v0, 1 +; HSA: v_mov_b32_e32 v1, 48 +; HSA: ds_write_b8 v1, v0 +; HSA: v_mov_b32_e32 v0, 2 +; HSA: v_mov_b32_e32 v1, 32 +; HSA: ds_write_b8 v1, v0 +; HSA: v_mov_b32_e32 v0, 3 +; HSA: v_mov_b32_e32 v1, 16 +; HSA: ds_write_b8 v1, v0 +; HSA: v_mov_b32_e32 v0, 4 +; HSA: v_mov_b32_e32 v1, 0 +; HSA: ds_write_b8 v1, v0 +; HSA: s_endpgm + +; LDS-USED: amdhsa_group_segment_fixed_size 57 +define amdgpu_kernel void @fix_alignment_2() { + %lds.size.9.align.1.bc = bitcast [9 x i8] addrspace(3)* @lds.size.9.align.1 to i8 addrspace(3)* + store i8 1, i8 addrspace(3)* %lds.size.9.align.1.bc, align 1 + + %lds.size.10.align.2.bc = bitcast [10 x i8] addrspace(3)* @lds.size.10.align.2 to i8 addrspace(3)* + store i8 2, i8 addrspace(3)* %lds.size.10.align.2.bc, align 2 + + %lds.size.11.align.4.bc = bitcast [11 x i8] addrspace(3)* @lds.size.11.align.4 to i8 addrspace(3)* + store i8 3, i8 addrspace(3)* %lds.size.11.align.4.bc, align 4 + + %lds.size.12.align.8.bc = bitcast [12 x i8] addrspace(3)* @lds.size.12.align.8 to i8 addrspace(3)* + store i8 4, i8 addrspace(3)* %lds.size.12.align.8.bc, align 8 + + ret void +} + + +; [TEST 3] All LDS are underaligned, requires to allocate on 8 byte boundary +; +; Fixed 
alignments for allocation should be: +; @lds.size.5.align.2 -> 8 byte boundary +; @lds.size.6.align.2 -> 8 byte boundary +; @lds.size.7.align.2 -> 8 byte boundary +; @lds.size.7.align.4 -> 8 byte boundary +; +; Sorted order based on *fixed* alignment and then by size should be: +; [@lds.size.7.align.4, @lds.size.7.align.2, @lds.size.6.align.2, @lds.size.5.align.2] +; OR +; [@lds.size.7.align.2, @lds.size.7.align.4, @lds.size.6.align.2, @lds.size.5.align.2] +; +; Memory allocated in the sorted order should be: +; 7 + (pad 1) + 7 + (pad 1) + 6 + (pad 2) + 5 [= 29 bytes] + +@lds.size.5.align.2 = internal unnamed_addr addrspace(3) global [5 x i8] undef, align 2 +@lds.size.6.align.2 = internal unnamed_addr addrspace(3) global [6 x i8] undef, align 2 +@lds.size.7.align.2 = internal unnamed_addr addrspace(3) global [7 x i8] undef, align 2 +@lds.size.7.align.4 = internal unnamed_addr addrspace(3) global [7 x i8] undef, align 4 + +; HSA-LABEL: {{^}}fix_alignment_3: +; HSA: v_mov_b32_e32 v0, 1 +; HSA: v_mov_b32_e32 v1, 24 +; HSA: ds_write_b8 v1, v0 +; HSA: v_mov_b32_e32 v0, 2 +; HSA: v_mov_b32_e32 v1, 16 +; HSA: ds_write_b8 v1, v0 +; HSA: v_mov_b32_e32 v0, 3 +; HSA: v_mov_b32_e32 v1, 0 +; HSA: ds_write_b8 v1, v0 +; HSA: v_mov_b32_e32 v0, 4 +; HSA: v_mov_b32_e32 v1, 8 +; HSA: ds_write_b8 v1, v0 +; HSA: s_endpgm + +; LDS-USED: amdhsa_group_segment_fixed_size 29 +define amdgpu_kernel void @fix_alignment_3() { + %lds.size.5.align.2.bc = bitcast [5 x i8] addrspace(3)* @lds.size.5.align.2 to i8 addrspace(3)* + store i8 1, i8 addrspace(3)* %lds.size.5.align.2.bc, align 2 + + %lds.size.6.align.2.bc = bitcast [6 x i8] addrspace(3)* @lds.size.6.align.2 to i8 addrspace(3)* + store i8 2, i8 addrspace(3)* %lds.size.6.align.2.bc, align 2 + + %lds.size.7.align.2.bc = bitcast [7 x i8] addrspace(3)* @lds.size.7.align.2 to i8 addrspace(3)* + store i8 3, i8 addrspace(3)* %lds.size.7.align.2.bc, align 2 + + %lds.size.7.align.4.bc = bitcast [7 x i8] addrspace(3)* @lds.size.7.align.4 to i8 addrspace(3)* + store i8 4, i8 addrspace(3)* %lds.size.7.align.4.bc, align 4 + + ret void +} + + +; [TEST 4] All LDS are of same alignment, but different size +; +; No alignment fix is required for allocation as all the LDS globals are on required alignment +; boundary. 
That is: +; @lds.size.17.align.16 -> 16 byte boundary +; @lds.size.18.align.16 -> 16 byte boundary +; @lds.size.19.align.16 -> 16 byte boundary +; @lds.size.20.align.16 -> 16 byte boundary +; +; Sorted order based on *size* should be: +; [@lds.size.20.align.16, @lds.size.19.align.16, @lds.size.18.align.16, @lds.size.17.align.16] +; +; Memory allocated in the sorted order should be: +; 20 + (pad 12) + 19 + (pad 13) + 18 + (pad 14) + 17 [= 113 bytes] + +@lds.size.17.align.16 = internal unnamed_addr addrspace(3) global [17 x i8] undef, align 16 +@lds.size.18.align.16 = internal unnamed_addr addrspace(3) global [18 x i8] undef, align 16 +@lds.size.19.align.16 = internal unnamed_addr addrspace(3) global [19 x i8] undef, align 16 +@lds.size.20.align.16 = internal unnamed_addr addrspace(3) global [20 x i8] undef, align 16 + +; HSA-LABEL: {{^}}fix_alignment_4: +; HSA: v_mov_b32_e32 v0, 1 +; HSA: v_mov_b32_e32 v1, 0x60 +; HSA: ds_write_b8 v1, v0 +; HSA: v_mov_b32_e32 v0, 2 +; HSA: v_mov_b32_e32 v1, 64 +; HSA: ds_write_b8 v1, v0 +; HSA: v_mov_b32_e32 v0, 3 +; HSA: v_mov_b32_e32 v1, 32 +; HSA: ds_write_b8 v1, v0 +; HSA: v_mov_b32_e32 v0, 4 +; HSA: v_mov_b32_e32 v1, 0 +; HSA: ds_write_b8 v1, v0 +; HSA: s_endpgm + +; LDS-USED: amdhsa_group_segment_fixed_size 113 +define amdgpu_kernel void @fix_alignment_4() { + %lds.size.17.align.16.bc = bitcast [17 x i8] addrspace(3)* @lds.size.17.align.16 to i8 addrspace(3)* + store i8 1, i8 addrspace(3)* %lds.size.17.align.16.bc, align 16 + + %lds.size.18.align.16.bc = bitcast [18 x i8] addrspace(3)* @lds.size.18.align.16 to i8 addrspace(3)* + store i8 2, i8 addrspace(3)* %lds.size.18.align.16.bc, align 16 + + %lds.size.19.align.16.bc = bitcast [19 x i8] addrspace(3)* @lds.size.19.align.16 to i8 addrspace(3)* + store i8 3, i8 addrspace(3)* %lds.size.19.align.16.bc, align 16 + + %lds.size.20.align.16.bc = bitcast [20 x i8] addrspace(3)* @lds.size.20.align.16 to i8 addrspace(3)* + store i8 4, i8 addrspace(3)* %lds.size.20.align.16.bc, align 16 + + ret void +} + +; [TEST 5] All LDS are of size 1, but different alignments. +; +; No alignment fix is required for allocation as all the LDS globals are on required alignment +; boundary. 
That is: +; @lds.size.1.align.1 -> any boundary +; @lds.size.1.align.2 -> any boundary +; @lds.size.1.align.4 -> any boundary +; @lds.size.1.align.8 -> any boundary +; @lds.size.1.align.16 -> any boundary +; +; Sorted order based on *size* should be: +; [@size.1.align.16, @size.1.align.8, @size.1.align.4, @size.1.align.2, size.1.align.1] +; +; Memory allocated in the sorted order should be: +; 1 + [gap 7] + 1 + [gap 3] + 1 + [gap 1] + 1 + 1 [= 16 bytes] + +@size.1.align.1 = internal unnamed_addr addrspace(3) global [1 x i8] undef, align 1 +@size.1.align.2 = internal unnamed_addr addrspace(3) global [1 x i8] undef, align 2 +@size.1.align.4 = internal unnamed_addr addrspace(3) global [1 x i8] undef, align 4 +@size.1.align.8 = internal unnamed_addr addrspace(3) global [1 x i8] undef, align 8 +@size.1.align.16 = internal unnamed_addr addrspace(3) global [1 x i8] undef, align 16 + +; HSA-LABEL: {{^}}fix_alignment_5: +; HSA: v_mov_b32_e32 v0, 1 +; HSA: v_mov_b32_e32 v1, 15 +; HSA: ds_write_b8 v1, v0 +; HSA: v_mov_b32_e32 v0, 2 +; HSA: v_mov_b32_e32 v1, 14 +; HSA: ds_write_b8 v1, v0 +; HSA: v_mov_b32_e32 v0, 3 +; HSA: v_mov_b32_e32 v1, 12 +; HSA: ds_write_b8 v1, v0 +; HSA: v_mov_b32_e32 v0, 4 +; HSA: v_mov_b32_e32 v1, 8 +; HSA: ds_write_b8 v1, v0 +; HSA: v_mov_b32_e32 v0, 5 +; HSA: v_mov_b32_e32 v1, 0 +; HSA: ds_write_b8 v1, v0 +; HSA: s_endpgm + +; LDS-USED: amdhsa_group_segment_fixed_size 16 +define amdgpu_kernel void @fix_alignment_5() { + %lds.size.1.align.1.bc = bitcast [1 x i8] addrspace(3)* @size.1.align.1 to i8 addrspace(3)* + store i8 1, i8 addrspace(3)* %lds.size.1.align.1.bc, align 1 + + %lds.size.1.align.2.bc = bitcast [1 x i8] addrspace(3)* @size.1.align.2 to i8 addrspace(3)* + store i8 2, i8 addrspace(3)* %lds.size.1.align.2.bc, align 2 + + %lds.size.1.align.4.bc = bitcast [1 x i8] addrspace(3)* @size.1.align.4 to i8 addrspace(3)* + store i8 3, i8 addrspace(3)* %lds.size.1.align.4.bc, align 4 + + %lds.size.1.align.8.bc = bitcast [1 x i8] addrspace(3)* @size.1.align.8 to i8 addrspace(3)* + store i8 4, i8 addrspace(3)* %lds.size.1.align.8.bc, align 8 + + %lds.size.1.align.16.bc = bitcast [1 x i8] addrspace(3)* @size.1.align.16 to i8 addrspace(3)* + store i8 5, i8 addrspace(3)* %lds.size.1.align.16.bc, align 16 + + ret void +} + +; [TEST 6] LDS used in multiple kernels. 
+; +; Fixed alignments for allocation should be: +; @lds.size.33.align.8 -> 16 bytes boundary +; @lds.size.67.align.8 -> 16 bytes boundary +; @lds.size.93.align.8 -> 16 bytes boundary +; @lds.size.11.align.8 -> 16 bytes boundary +; @lds.size.13.align.8 -> 16 bytes boundary +; @lds.size.23.align.8 -> 16 bytes boundary +; +; kernel : @fix_alignment_6 +; +; Sorted order based on *size* should be: +; [@lds.size.93.align.8, @lds.size.67.align.8, @lds.size.33.align.8] +; +; Memory allocated in the sorted order should be: +; 93 + [pad 3] + 67 + [pad 13] + 33 [= 209 bytes] +; +; kernel : @fix_alignment_7 +; +; Sorted order based on *size* should be: +; [@lds.size.93.align.8, @lds.size.13.align.8, @lds.size.11.align.8] +; +; Memory allocated in the sorted order should be: +; 93 + [pad 3] + 13 + [pad 3] + 11 [= 123 bytes] + +; kernel : @fix_alignment_8 +; +; Sorted order based on *size* should be: +; [@lds.size.93.align.8, @lds.size.67.align.8, @lds.size.33.align.8, @lds.size.13.align.8, @lds.size.11.align.8] +; +; Memory allocated in the sorted order should be: +; 93 + [pad 3] + 67 + [pad 13] + 33 + [pad 15] + 13 + [3] + 11 [= 251 bytes] + +@lds.size.33.align.8 = internal unnamed_addr addrspace(3) global [33 x i8] undef, align 8 +@lds.size.67.align.8 = internal unnamed_addr addrspace(3) global [67 x i8] undef, align 8 +@lds.size.93.align.8 = internal unnamed_addr addrspace(3) global [93 x i8] undef, align 8 +@lds.size.11.align.8 = internal unnamed_addr addrspace(3) global [11 x i8] undef, align 8 +@lds.size.13.align.8 = internal unnamed_addr addrspace(3) global [13 x i8] undef, align 8 + +; HSA-LABEL: {{^}}fix_alignment_6: +; HSA: v_mov_b32_e32 v0, 1 +; HSA: v_mov_b32_e32 v1, 0xb0 +; HSA: ds_write_b8 v1, v0 +; HSA: v_mov_b32_e32 v0, 2 +; HSA: v_mov_b32_e32 v1, 0x60 +; HSA: ds_write_b8 v1, v0 +; HSA: v_mov_b32_e32 v0, 3 +; HSA: v_mov_b32_e32 v1, 0 +; HSA: ds_write_b8 v1, v0 +; HSA: s_endpgm + +; LDS-USED: amdhsa_group_segment_fixed_size 209 +define amdgpu_kernel void @fix_alignment_6() { + %lds.size.33.align.8.bc = bitcast [33 x i8] addrspace(3)* @lds.size.33.align.8 to i8 addrspace(3)* + store i8 1, i8 addrspace(3)* %lds.size.33.align.8.bc, align 8 + + %lds.size.67.align.8.bc = bitcast [67 x i8] addrspace(3)* @lds.size.67.align.8 to i8 addrspace(3)* + store i8 2, i8 addrspace(3)* %lds.size.67.align.8.bc, align 2 + + %lds.size.93.align.8.bc = bitcast [93 x i8] addrspace(3)* @lds.size.93.align.8 to i8 addrspace(3)* + store i8 3, i8 addrspace(3)* %lds.size.93.align.8.bc, align 4 + + ret void +} + +; HSA-LABEL: {{^}}fix_alignment_7: +; HSA: v_mov_b32_e32 v0, 1 +; HSA: v_mov_b32_e32 v1, 0 +; HSA: ds_write_b8 v1, v0 +; HSA: v_mov_b32_e32 v0, 2 +; HSA: v_mov_b32_e32 v1, 0x70 +; HSA: ds_write_b8 v1, v0 +; HSA: v_mov_b32_e32 v0, 3 +; HSA: v_mov_b32_e32 v1, 0x60 +; HSA: ds_write_b8 v1, v0 +; HSA: s_endpgm + +; LDS-USED: amdhsa_group_segment_fixed_size 123 +define amdgpu_kernel void @fix_alignment_7() { + %lds.size.93.align.8.bc = bitcast [93 x i8] addrspace(3)* @lds.size.93.align.8 to i8 addrspace(3)* + store i8 1, i8 addrspace(3)* %lds.size.93.align.8.bc, align 4 + + %lds.size.11.align.8.bc = bitcast [11 x i8] addrspace(3)* @lds.size.11.align.8 to i8 addrspace(3)* + store i8 2, i8 addrspace(3)* %lds.size.11.align.8.bc, align 8 + + %lds.size.13.align.8.bc = bitcast [13 x i8] addrspace(3)* @lds.size.13.align.8 to i8 addrspace(3)* + store i8 3, i8 addrspace(3)* %lds.size.13.align.8.bc, align 2 + + ret void +} + +; HSA-LABEL: {{^}}fix_alignment_8: +; HSA: v_mov_b32_e32 v0, 1 +; HSA: v_mov_b32_e32 v1, 0xf0 +; 
HSA: ds_write_b8 v1, v0 +; HSA: v_mov_b32_e32 v0, 2 +; HSA: v_mov_b32_e32 v2, 0xb0 +; HSA: ds_write_b8 v2, v0 +; HSA: v_mov_b32_e32 v0, 3 +; HSA: v_mov_b32_e32 v2, 0x60 +; HSA: ds_write_b8 v2, v0 +; HSA: v_mov_b32_e32 v0, 4 +; HSA: v_mov_b32_e32 v2, 0 +; HSA: ds_write_b8 v2, v0 +; HSA: v_mov_b32_e32 v0, 5 +; HSA: ds_write_b8 v1, v0 +; HSA: v_mov_b32_e32 v0, 6 +; HSA: v_mov_b32_e32 v1, 0xe0 +; HSA: ds_write_b8 v1, v0 +; HSA: v_mov_b32_e32 v0, 7 +; HSA: ds_write_b8 v2, v0 +; HSA: s_endpgm + +; LDS-USED: amdhsa_group_segment_fixed_size 251 +define amdgpu_kernel void @fix_alignment_8() { + %lds.size.11.align.8.bc.1 = bitcast [11 x i8] addrspace(3)* @lds.size.11.align.8 to i8 addrspace(3)* + store i8 1, i8 addrspace(3)* %lds.size.11.align.8.bc.1, align 8 + + %lds.size.33.align.8.bc.2 = bitcast [33 x i8] addrspace(3)* @lds.size.33.align.8 to i8 addrspace(3)* + store i8 2, i8 addrspace(3)* %lds.size.33.align.8.bc.2, align 8 + + %lds.size.67.align.8.bc.3 = bitcast [67 x i8] addrspace(3)* @lds.size.67.align.8 to i8 addrspace(3)* + store i8 3, i8 addrspace(3)* %lds.size.67.align.8.bc.3, align 2 + + %lds.size.93.align.8.bc.4 = bitcast [93 x i8] addrspace(3)* @lds.size.93.align.8 to i8 addrspace(3)* + store i8 4, i8 addrspace(3)* %lds.size.93.align.8.bc.4, align 4 + + %lds.size.11.align.8.bc.5 = bitcast [11 x i8] addrspace(3)* @lds.size.11.align.8 to i8 addrspace(3)* + store i8 5, i8 addrspace(3)* %lds.size.11.align.8.bc.5, align 8 + + %lds.size.13.align.8.bc.6 = bitcast [13 x i8] addrspace(3)* @lds.size.13.align.8 to i8 addrspace(3)* + store i8 6, i8 addrspace(3)* %lds.size.13.align.8.bc.6, align 2 + + %lds.size.93.align.8.bc.7 = bitcast [93 x i8] addrspace(3)* @lds.size.93.align.8 to i8 addrspace(3)* + store i8 7, i8 addrspace(3)* %lds.size.93.align.8.bc.7, align 4 + + ret void +} diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/lds-allocation2.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/lds-allocation2.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/lds-allocation2.ll @@ -0,0 +1,61 @@ +; RUN: llc -global-isel -march=amdgcn -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx900 --amdhsa-code-object-version=4 < %s | FileCheck -check-prefixes=HSA,LDS-USED %s + +; LDS allocation is now done in the sorted order as below: +; +; 1. First, alignment is fixed, even for missing alignment or underaligned cases since +; we know the size of LDS globals at compile time. +; 2. Next, per kernel used LDS globals are sorted, based on natural alignment, on ties, +; based on size. +; 3. Finally, memory is allocated in the above sorted order. + +; [TEST 0] The variable llvm.amdgcn.module.lds should be allocated at address 0 +; +; No alignment fix is required for allocation as all the LDS globals are on required alignment +; boundary. 
That is: +; @lds.size.111.align.16 -> 16 byte boundary +; @lds.size.222.align.16 -> 16 byte boundary +; @lds.size.333.align.16 -> 16 byte boundary +; @llvm.amdgcn.module.lds -> 16 byte boundary +; +; Sorted order should be: +; [@llvm.amdgcn.module.lds, @lds.size.333.align.16, @lds.size.222.align.16, @lds.size.111.align.16] +; +; Memory allocated in the sorted order should be: +; 16 + 333 + [pad 3] + 222 + [pad 2] + 111 [= 687 bytes] + +@lds.size.111.align.16 = internal unnamed_addr addrspace(3) global [111 x i8] undef, align 16 +@lds.size.222.align.16 = internal unnamed_addr addrspace(3) global [222 x i8] undef, align 16 +@lds.size.333.align.16 = internal unnamed_addr addrspace(3) global [333 x i8] undef, align 16 +@llvm.amdgcn.module.lds = internal unnamed_addr addrspace(3) global [16 x i8] undef, align 16 + +; HSA-LABEL: {{^}}fix_alignment_0: +; HSA: v_mov_b32_e32 v0, 1 +; HSA: v_mov_b32_e32 v1, 0x240 +; HSA: ds_write_b8 v1, v0 +; HSA: v_mov_b32_e32 v0, 2 +; HSA: v_mov_b32_e32 v1, 0x160 +; HSA: ds_write_b8 v1, v0 +; HSA: v_mov_b32_e32 v0, 3 +; HSA: v_mov_b32_e32 v1, 16 +; HSA: ds_write_b8 v1, v0 +; HSA: v_mov_b32_e32 v0, 4 +; HSA: v_mov_b32_e32 v1, 0 +; HSA: ds_write_b8 v1, v0 +; HSA: s_endpgm + +; LDS-USED: amdhsa_group_segment_fixed_size 687 +define amdgpu_kernel void @fix_alignment_0() { + %lds.size.111.align.16.bc = bitcast [111 x i8] addrspace(3)* @lds.size.111.align.16 to i8 addrspace(3)* + store i8 1, i8 addrspace(3)* %lds.size.111.align.16.bc, align 16 + + %lds.size.222.align.16.bc = bitcast [222 x i8] addrspace(3)* @lds.size.222.align.16 to i8 addrspace(3)* + store i8 2, i8 addrspace(3)* %lds.size.222.align.16.bc, align 16 + + %lds.size.333.align.16.bc = bitcast [333 x i8] addrspace(3)* @lds.size.333.align.16 to i8 addrspace(3)* + store i8 3, i8 addrspace(3)* %lds.size.333.align.16.bc, align 16 + + %llvm.amdgcn.module.lds.bc = bitcast [16 x i8] addrspace(3)* @llvm.amdgcn.module.lds to i8 addrspace(3)* + store i8 4, i8 addrspace(3)* %llvm.amdgcn.module.lds.bc, align 16 + + ret void +} diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/lds-global-value.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/lds-global-value.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/lds-global-value.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/lds-global-value.ll @@ -11,7 +11,7 @@ ; CHECK-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; CHECK-NEXT: v_mov_b32_e32 v0, 4 ; CHECK-NEXT: s_mov_b32 m0, -1 -; CHECK-NEXT: ds_read_b32 v3, v0 offset:4 +; CHECK-NEXT: ds_read_b32 v3, v0 ; CHECK-NEXT: v_mov_b32_e32 v2, 9 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_add_u32 s0, s0, 4 @@ -19,7 +19,7 @@ ; CHECK-NEXT: v_mov_b32_e32 v0, s0 ; CHECK-NEXT: v_mov_b32_e32 v1, s1 ; CHECK-NEXT: flat_store_dword v[0:1], v3 -; CHECK-NEXT: v_mov_b32_e32 v0, 0 +; CHECK-NEXT: v_mov_b32_e32 v0, 0x200 ; CHECK-NEXT: ds_write_b32 v0, v2 ; CHECK-NEXT: s_endpgm entry: diff --git a/llvm/test/CodeGen/AMDGPU/lds-alignment.ll b/llvm/test/CodeGen/AMDGPU/lds-alignment.ll --- a/llvm/test/CodeGen/AMDGPU/lds-alignment.ll +++ b/llvm/test/CodeGen/AMDGPU/lds-alignment.ll @@ -1,5 +1,11 @@ ; RUN: llc -march=amdgcn -mtriple=amdgcn-unknown-amdhsa --amdhsa-code-object-version=2 < %s | FileCheck -check-prefix=HSA %s +; LDS allocation is now done in the sorted order: +; 1. First, alignment is fixed, even for missing alignment or underaligned cases since we know the +; size of LDS globals at compile time. +; 2. Next, per kernel used LDS globals are sorted, based on natural alignment, on ties, based on size. +; 3. Finally, memory is allocated in the above sorted order. 
+ @lds.align16.0 = internal unnamed_addr addrspace(3) global [38 x i8] undef, align 16 @lds.align16.1 = internal unnamed_addr addrspace(3) global [38 x i8] undef, align 16 @@ -25,7 +31,7 @@ ; There are two objects, so one requires padding to be correctly ; aligned after the other. -; (38 -> 48) + 38 = 92 +; 38 + (10 pad) + 38 = 86 ; I don't think it is necessary to add padding after since if there ; were to be a dynamically sized LDS kernel arg, the runtime should @@ -94,9 +100,9 @@ ret void } -; (7 * 8) + (39 * 4) = 212 +; 156 + (4 pad) + 56 = 216 ; HSA-LABEL: {{^}}test_missing_alignment_size_2_order0: -; HSA: workgroup_group_segment_byte_size = 212 +; HSA: workgroup_group_segment_byte_size = 216 ; HSA: group_segment_alignment = 4 define amdgpu_kernel void @test_missing_alignment_size_2_order0(i8 addrspace(1)* %out, i8 addrspace(1)* %in) #1 { %lds.missing.align.0.bc = bitcast [39 x i32] addrspace(3)* @lds.missing.align.0 to i8 addrspace(3)* @@ -110,7 +116,7 @@ ret void } -; (39 * 4) + (4 pad) + (7 * 8) = 216 +; 156 + (4 pad) + 56 = 216 ; HSA-LABEL: {{^}}test_missing_alignment_size_2_order1: ; HSA: workgroup_group_segment_byte_size = 216 ; HSA: group_segment_alignment = 4 @@ -158,7 +164,7 @@ ret void } -; align 32, 8, 16 +; align 32, 16, 8 ; 38 (+ 2 pad) + 38 + (18 pad) + 38 = 134 ; HSA-LABEL: {{^}}test_round_size_3_order1: ; HSA: workgroup_group_segment_byte_size = 134 @@ -179,10 +185,10 @@ ret void } -; align 16, 32, 8 -; 38 + (26 pad) + 38 + (10 pad) + 38 = 150 +; align 32, 16, 8 +; 38 + (10 pad) + 38 + (10 pad) + 38 = 134 ; HSA-LABEL: {{^}}test_round_size_3_order2: -; HSA: workgroup_group_segment_byte_size = 150 +; HSA: workgroup_group_segment_byte_size = 134 ; HSA: group_segment_alignment = 4 define amdgpu_kernel void @test_round_size_3_order2(i8 addrspace(1)* %out, i8 addrspace(1)* %in) #1 { %lds.align16.0.bc = bitcast [38 x i8] addrspace(3)* @lds.align16.0 to i8 addrspace(3)* @@ -200,10 +206,10 @@ ret void } -; align 16, 8, 32 +; align 32, 16, 8 ; 38 + (2 pad) + 38 + (2 pad) + 38 ; HSA-LABEL: {{^}}test_round_size_3_order3: -; HSA: workgroup_group_segment_byte_size = 118 +; HSA: workgroup_group_segment_byte_size = 134 ; HSA: group_segment_alignment = 4 define amdgpu_kernel void @test_round_size_3_order3(i8 addrspace(1)* %out, i8 addrspace(1)* %in) #1 { %lds.align16.0.bc = bitcast [38 x i8] addrspace(3)* @lds.align16.0 to i8 addrspace(3)* @@ -221,10 +227,10 @@ ret void } -; align 8, 32, 16 -; 38 + (26 pad) + 38 + (2 pad) + 38 = 142 +; align 32, 16, 8 +; 38 + (10 pad) + 38 + (10 pad) + 38 = 134 ; HSA-LABEL: {{^}}test_round_size_3_order4: -; HSA: workgroup_group_segment_byte_size = 142 +; HSA: workgroup_group_segment_byte_size = 134 ; HSA: group_segment_alignment = 4 define amdgpu_kernel void @test_round_size_3_order4(i8 addrspace(1)* %out, i8 addrspace(1)* %in) #1 { %lds.align8.0.bc = bitcast [38 x i8] addrspace(3)* @lds.align8.0 to i8 addrspace(3)* @@ -242,10 +248,10 @@ ret void } -; align 8, 16, 32 -; 38 + (10 pad) + 38 + (2 pad) + 38 = 126 +; align 32, 16, 8 +; 38 + (10 pad) + 38 + (10 pad) + 38 = 134 ; HSA-LABEL: {{^}}test_round_size_3_order5: -; HSA: workgroup_group_segment_byte_size = 126 +; HSA: workgroup_group_segment_byte_size = 134 ; HSA: group_segment_alignment = 4 define amdgpu_kernel void @test_round_size_3_order5(i8 addrspace(1)* %out, i8 addrspace(1)* %in) #1 { %lds.align8.0.bc = bitcast [38 x i8] addrspace(3)* @lds.align8.0 to i8 addrspace(3)* diff --git a/llvm/test/CodeGen/AMDGPU/lds-allocation.ll b/llvm/test/CodeGen/AMDGPU/lds-allocation.ll new file mode 100644 --- 
/dev/null +++ b/llvm/test/CodeGen/AMDGPU/lds-allocation.ll @@ -0,0 +1,450 @@ +; RUN: llc -march=amdgcn -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx900 --amdhsa-code-object-version=4 < %s | FileCheck -check-prefixes=HSA,LDS-USED %s + +; LDS allocation is now done in the sorted order as below: +; +; 1. First, alignment is fixed, even for missing alignment or underaligned cases since +; we know the size of LDS globals at compile time. +; 2. Next, per kernel used LDS globals are sorted, based on natural alignment, on ties, +; based on size. +; 3. Finally, memory is allocated in the above sorted order. + + +; [TEST 0] All LDS are properly aligned +; +; No alignment fix is required for allocation as all the LDS globals are on required alignment +; boundary. That is: +; @lds.size.16.align.16 -> 16 byte boundary +; @lds.size.8.align.8 -> 8 byte boundary +; @lds.size.4.align.4 -> 4 byte boundary +; @lds.size.2.align.2 -> 2 byte boundary +; @lds.size.1.align.1 -> 1 byte boundary (any boundary) +; +; Sorted order based on alignment should be: +; [@lds.size.16.align.16, @lds.size.8.align.8, @lds.size.4.align.4, @lds.size.2.align.2, @lds.size.1.align.1] +; +; Memory allocated in the sorted order should be: +; 16 + 8 + 4 + 2 + 1 [= 31 bytes] + +@lds.size.1.align.1 = internal unnamed_addr addrspace(3) global [1 x i8] undef, align 1 +@lds.size.2.align.2 = internal unnamed_addr addrspace(3) global [2 x i8] undef, align 2 +@lds.size.4.align.4 = internal unnamed_addr addrspace(3) global [4 x i8] undef, align 4 +@lds.size.8.align.8 = internal unnamed_addr addrspace(3) global [8 x i8] undef, align 8 +@lds.size.16.align.16 = internal unnamed_addr addrspace(3) global [16 x i8] undef, align 16 + +; HSA-LABEL: {{^}}fix_alignment_0: +; HSA: v_mov_b32_e32 v0, 0 +; HSA: v_mov_b32_e32 v1, 1 +; HSA: ds_write_b8 v0, v1 offset:30 +; HSA: v_mov_b32_e32 v1, 2 +; HSA: ds_write_b8 v0, v1 offset:28 +; HSA: v_mov_b32_e32 v1, 3 +; HSA: ds_write_b8 v0, v1 offset:24 +; HSA: v_mov_b32_e32 v1, 4 +; HSA: ds_write_b8 v0, v1 offset:16 +; HSA: v_mov_b32_e32 v1, 5 +; HSA: ds_write_b8 v0, v1 +; HSA: s_endpgm + +; LDS-USED: amdhsa_group_segment_fixed_size 31 +define amdgpu_kernel void @fix_alignment_0() { + %lds.size.1.align.1.bc = bitcast [1 x i8] addrspace(3)* @lds.size.1.align.1 to i8 addrspace(3)* + store i8 1, i8 addrspace(3)* %lds.size.1.align.1.bc, align 1 + + %lds.size.2.align.2.bc = bitcast [2 x i8] addrspace(3)* @lds.size.2.align.2 to i8 addrspace(3)* + store i8 2, i8 addrspace(3)* %lds.size.2.align.2.bc, align 2 + + %lds.size.4.align.4.bc = bitcast [4 x i8] addrspace(3)* @lds.size.4.align.4 to i8 addrspace(3)* + store i8 3, i8 addrspace(3)* %lds.size.4.align.4.bc, align 4 + + %lds.size.8.align.8.bc = bitcast [8 x i8] addrspace(3)* @lds.size.8.align.8 to i8 addrspace(3)* + store i8 4, i8 addrspace(3)* %lds.size.8.align.8.bc, align 8 + + %lds.size.16.align.16.bc = bitcast [16 x i8] addrspace(3)* @lds.size.16.align.16 to i8 addrspace(3)* + store i8 5, i8 addrspace(3)* %lds.size.16.align.16.bc, align 16 + + ret void +} + + +; [TEST 1] All LDS are underaligned, requires to allocate on different alignment boundary +; +; Fixed alignments for allocation should be: +; @lds.size.9.align.8 -> 16 byte boundary +; @lds.size.5.align.4 -> 8 byte boundary +; @lds.size.3.align.2 -> 4 byte boundary +; @lds.size.2.align.1 -> 2 byte boundary +; +; Sorted order based on *fixed* alignment should be: +; [@lds.size.9.align.8, lds.size.5.align.4, lds.size.3.align.2, lds.size.2.align.1] +; +; Memory allocated in the sorted order should be: +; 9 + (pad 7) + 
5 + (pad 3) + 3 + (pad 1) + 2 [= 30 bytes] + +@lds.size.2.align.1 = internal unnamed_addr addrspace(3) global [2 x i8] undef, align 1 +@lds.size.3.align.2 = internal unnamed_addr addrspace(3) global [3 x i8] undef, align 2 +@lds.size.5.align.4 = internal unnamed_addr addrspace(3) global [5 x i8] undef, align 4 +@lds.size.9.align.8 = internal unnamed_addr addrspace(3) global [9 x i8] undef, align 8 + +; HSA-LABEL: {{^}}fix_alignment_1: +; HSA: v_mov_b32_e32 v0, 0 +; HSA: v_mov_b32_e32 v1, 1 +; HSA: ds_write_b8 v0, v1 offset:28 +; HSA: v_mov_b32_e32 v1, 2 +; HSA: ds_write_b8 v0, v1 offset:24 +; HSA: v_mov_b32_e32 v1, 3 +; HSA: ds_write_b8 v0, v1 offset:16 +; HSA: v_mov_b32_e32 v1, 4 +; HSA: ds_write_b8 v0, v1 +; HSA: s_endpgm + +; LDS-USED: amdhsa_group_segment_fixed_size 30 +define amdgpu_kernel void @fix_alignment_1() { + %lds.size.2.align.1.bc = bitcast [2 x i8] addrspace(3)* @lds.size.2.align.1 to i8 addrspace(3)* + store i8 1, i8 addrspace(3)* %lds.size.2.align.1.bc, align 1 + + %lds.size.3.align.2.bc = bitcast [3 x i8] addrspace(3)* @lds.size.3.align.2 to i8 addrspace(3)* + store i8 2, i8 addrspace(3)* %lds.size.3.align.2.bc, align 2 + + %lds.size.5.align.4.bc = bitcast [5 x i8] addrspace(3)* @lds.size.5.align.4 to i8 addrspace(3)* + store i8 3, i8 addrspace(3)* %lds.size.5.align.4.bc, align 4 + + %lds.size.9.align.8.bc = bitcast [9 x i8] addrspace(3)* @lds.size.9.align.8 to i8 addrspace(3)* + store i8 4, i8 addrspace(3)* %lds.size.9.align.8.bc, align 8 + + ret void +} + + +; [TEST 2] All LDS are underaligned, requires to allocate on 16 byte boundary +; +; Fixed alignments for allocation should be: +; @lds.size.9.align.1 -> 16 byte boundary +; @lds.size.10.align.2 -> 16 byte boundary +; @lds.size.11.align.4 -> 16 byte boundary +; @lds.size.12.align.8 -> 16 byte boundary +; +; Sorted order based on *fixed* alignment and then by size should be: +; [@lds.size.12.align.8, @lds.size.11.align.4, @lds.size.10.align.2, @lds.size.9.align.1] +; +; Memory allocated in the sorted order should be: +; 12 + (pad 4) + 11 + (pad 5) + 10 + (pad 6) + 9 [= 57 bytes] + +@lds.size.9.align.1 = internal unnamed_addr addrspace(3) global [9 x i8] undef, align 1 +@lds.size.10.align.2 = internal unnamed_addr addrspace(3) global [10 x i8] undef, align 2 +@lds.size.11.align.4 = internal unnamed_addr addrspace(3) global [11 x i8] undef, align 4 +@lds.size.12.align.8 = internal unnamed_addr addrspace(3) global [12 x i8] undef, align 8 + +; HSA-LABEL: {{^}}fix_alignment_2: +; HSA: v_mov_b32_e32 v0, 0 +; HSA: v_mov_b32_e32 v1, 1 +; HSA: ds_write_b8 v0, v1 offset:48 +; HSA: v_mov_b32_e32 v1, 2 +; HSA: ds_write_b8 v0, v1 offset:32 +; HSA: v_mov_b32_e32 v1, 3 +; HSA: ds_write_b8 v0, v1 offset:16 +; HSA: v_mov_b32_e32 v1, 4 +; HSA: ds_write_b8 v0, v1 +; HSA: s_endpgm + +; LDS-USED: amdhsa_group_segment_fixed_size 57 +define amdgpu_kernel void @fix_alignment_2() { + %lds.size.9.align.1.bc = bitcast [9 x i8] addrspace(3)* @lds.size.9.align.1 to i8 addrspace(3)* + store i8 1, i8 addrspace(3)* %lds.size.9.align.1.bc, align 1 + + %lds.size.10.align.2.bc = bitcast [10 x i8] addrspace(3)* @lds.size.10.align.2 to i8 addrspace(3)* + store i8 2, i8 addrspace(3)* %lds.size.10.align.2.bc, align 2 + + %lds.size.11.align.4.bc = bitcast [11 x i8] addrspace(3)* @lds.size.11.align.4 to i8 addrspace(3)* + store i8 3, i8 addrspace(3)* %lds.size.11.align.4.bc, align 4 + + %lds.size.12.align.8.bc = bitcast [12 x i8] addrspace(3)* @lds.size.12.align.8 to i8 addrspace(3)* + store i8 4, i8 addrspace(3)* %lds.size.12.align.8.bc, align 8 + + ret 
void +} + + +; [TEST 3] All LDS are underaligned, requires to allocate on 8 byte boundary +; +; Fixed alignments for allocation should be: +; @lds.size.5.align.2 -> 8 byte boundary +; @lds.size.6.align.2 -> 8 byte boundary +; @lds.size.7.align.2 -> 8 byte boundary +; @lds.size.7.align.4 -> 8 byte boundary +; +; Sorted order based on *fixed* alignment and then by size should be: +; [@lds.size.7.align.4, @lds.size.7.align.2, @lds.size.6.align.2, @lds.size.5.align.2] +; OR +; [@lds.size.7.align.2, @lds.size.7.align.4, @lds.size.6.align.2, @lds.size.5.align.2] +; +; Memory allocated in the sorted order should be: +; 7 + (pad 1) + 7 + (pad 1) + 6 + (pad 2) + 5 [= 29 bytes] + +@lds.size.5.align.2 = internal unnamed_addr addrspace(3) global [5 x i8] undef, align 2 +@lds.size.6.align.2 = internal unnamed_addr addrspace(3) global [6 x i8] undef, align 2 +@lds.size.7.align.2 = internal unnamed_addr addrspace(3) global [7 x i8] undef, align 2 +@lds.size.7.align.4 = internal unnamed_addr addrspace(3) global [7 x i8] undef, align 4 + +; HSA-LABEL: {{^}}fix_alignment_3: +; HSA: v_mov_b32_e32 v0, 0 +; HSA: v_mov_b32_e32 v1, 1 +; HSA: ds_write_b8 v0, v1 offset:24 +; HSA: v_mov_b32_e32 v1, 2 +; HSA: ds_write_b8 v0, v1 offset:16 +; HSA: v_mov_b32_e32 v1, 3 +; HSA: ds_write_b8 v0, v1 +; HSA: v_mov_b32_e32 v1, 4 +; HSA: ds_write_b8 v0, v1 offset:8 +; HSA: s_endpgm + +; LDS-USED: amdhsa_group_segment_fixed_size 29 +define amdgpu_kernel void @fix_alignment_3() { + %lds.size.5.align.2.bc = bitcast [5 x i8] addrspace(3)* @lds.size.5.align.2 to i8 addrspace(3)* + store i8 1, i8 addrspace(3)* %lds.size.5.align.2.bc, align 2 + + %lds.size.6.align.2.bc = bitcast [6 x i8] addrspace(3)* @lds.size.6.align.2 to i8 addrspace(3)* + store i8 2, i8 addrspace(3)* %lds.size.6.align.2.bc, align 2 + + %lds.size.7.align.2.bc = bitcast [7 x i8] addrspace(3)* @lds.size.7.align.2 to i8 addrspace(3)* + store i8 3, i8 addrspace(3)* %lds.size.7.align.2.bc, align 2 + + %lds.size.7.align.4.bc = bitcast [7 x i8] addrspace(3)* @lds.size.7.align.4 to i8 addrspace(3)* + store i8 4, i8 addrspace(3)* %lds.size.7.align.4.bc, align 4 + + ret void +} + + +; [TEST 4] All LDS are of same alignment, but different size +; +; No alignment fix is required for allocation as all the LDS globals are on required alignment +; boundary. 
That is: +; @lds.size.17.align.16 -> 16 byte boundary +; @lds.size.18.align.16 -> 16 byte boundary +; @lds.size.19.align.16 -> 16 byte boundary +; @lds.size.20.align.16 -> 16 byte boundary +; +; Sorted order based on *size* should be: +; [@lds.size.20.align.16, @lds.size.19.align.16, @lds.size.18.align.16, @lds.size.17.align.16] +; +; Memory allocated in the sorted order should be: +; 20 + (pad 12) + 19 + (pad 13) + 18 + (pad 14) + 17 [= 113 bytes] + +@lds.size.17.align.16 = internal unnamed_addr addrspace(3) global [17 x i8] undef, align 16 +@lds.size.18.align.16 = internal unnamed_addr addrspace(3) global [18 x i8] undef, align 16 +@lds.size.19.align.16 = internal unnamed_addr addrspace(3) global [19 x i8] undef, align 16 +@lds.size.20.align.16 = internal unnamed_addr addrspace(3) global [20 x i8] undef, align 16 + +; HSA-LABEL: {{^}}fix_alignment_4: +; HSA: v_mov_b32_e32 v0, 0 +; HSA: v_mov_b32_e32 v1, 1 +; HSA: ds_write_b8 v0, v1 offset:96 +; HSA: v_mov_b32_e32 v1, 2 +; HSA: ds_write_b8 v0, v1 offset:64 +; HSA: v_mov_b32_e32 v1, 3 +; HSA: ds_write_b8 v0, v1 offset:32 +; HSA: v_mov_b32_e32 v1, 4 +; HSA: ds_write_b8 v0, v1 +; HSA: s_endpgm + +; LDS-USED: amdhsa_group_segment_fixed_size 113 +define amdgpu_kernel void @fix_alignment_4() { + %lds.size.17.align.16.bc = bitcast [17 x i8] addrspace(3)* @lds.size.17.align.16 to i8 addrspace(3)* + store i8 1, i8 addrspace(3)* %lds.size.17.align.16.bc, align 16 + + %lds.size.18.align.16.bc = bitcast [18 x i8] addrspace(3)* @lds.size.18.align.16 to i8 addrspace(3)* + store i8 2, i8 addrspace(3)* %lds.size.18.align.16.bc, align 16 + + %lds.size.19.align.16.bc = bitcast [19 x i8] addrspace(3)* @lds.size.19.align.16 to i8 addrspace(3)* + store i8 3, i8 addrspace(3)* %lds.size.19.align.16.bc, align 16 + + %lds.size.20.align.16.bc = bitcast [20 x i8] addrspace(3)* @lds.size.20.align.16 to i8 addrspace(3)* + store i8 4, i8 addrspace(3)* %lds.size.20.align.16.bc, align 16 + + ret void +} + +; [TEST 5] All LDS are of size 1, but different alignments. +; +; No alignment fix is required for allocation as all the LDS globals are on required alignment +; boundary. 
That is: +; @size.1.align.1 -> any boundary +; @size.1.align.2 -> any boundary +; @size.1.align.4 -> any boundary +; @size.1.align.8 -> any boundary +; @size.1.align.16 -> any boundary +; +; Sorted order based on *size* should be: +; [@size.1.align.16, @size.1.align.8, @size.1.align.4, @size.1.align.2, size.1.align.1] +; +; Memory allocated in the sorted order should be: +; 1 + [gap 7] + 1 + [gap 3] + 1 + [gap 1] + 1 + 1 [= 16 bytes] + +@size.1.align.1 = internal unnamed_addr addrspace(3) global [1 x i8] undef, align 1 +@size.1.align.2 = internal unnamed_addr addrspace(3) global [1 x i8] undef, align 2 +@size.1.align.4 = internal unnamed_addr addrspace(3) global [1 x i8] undef, align 4 +@size.1.align.8 = internal unnamed_addr addrspace(3) global [1 x i8] undef, align 8 +@size.1.align.16 = internal unnamed_addr addrspace(3) global [1 x i8] undef, align 16 + +; HSA-LABEL: {{^}}fix_alignment_5: +; HSA: v_mov_b32_e32 v0, 0 +; HSA: v_mov_b32_e32 v1, 1 +; HSA: ds_write_b8 v0, v1 offset:15 +; HSA: v_mov_b32_e32 v1, 2 +; HSA: ds_write_b8 v0, v1 offset:14 +; HSA: v_mov_b32_e32 v1, 3 +; HSA: ds_write_b8 v0, v1 offset:12 +; HSA: v_mov_b32_e32 v1, 4 +; HSA: ds_write_b8 v0, v1 offset:8 +; HSA: v_mov_b32_e32 v1, 5 +; HSA: ds_write_b8 v0, v1 +; HSA: s_endpgm + +; LDS-USED: amdhsa_group_segment_fixed_size 16 +define amdgpu_kernel void @fix_alignment_5() { + %lds.size.1.align.1.bc = bitcast [1 x i8] addrspace(3)* @size.1.align.1 to i8 addrspace(3)* + store i8 1, i8 addrspace(3)* %lds.size.1.align.1.bc, align 1 + + %lds.size.1.align.2.bc = bitcast [1 x i8] addrspace(3)* @size.1.align.2 to i8 addrspace(3)* + store i8 2, i8 addrspace(3)* %lds.size.1.align.2.bc, align 2 + + %lds.size.1.align.4.bc = bitcast [1 x i8] addrspace(3)* @size.1.align.4 to i8 addrspace(3)* + store i8 3, i8 addrspace(3)* %lds.size.1.align.4.bc, align 4 + + %lds.size.1.align.8.bc = bitcast [1 x i8] addrspace(3)* @size.1.align.8 to i8 addrspace(3)* + store i8 4, i8 addrspace(3)* %lds.size.1.align.8.bc, align 8 + + %lds.size.1.align.16.bc = bitcast [1 x i8] addrspace(3)* @size.1.align.16 to i8 addrspace(3)* + store i8 5, i8 addrspace(3)* %lds.size.1.align.16.bc, align 16 + + ret void +} + + +; [TEST 6] LDS used in multiple kernels. 
+; +; Fixed alignments for allocation should be: +; @lds.size.33.align.8 -> 16 bytes boundary +; @lds.size.67.align.8 -> 16 bytes boundary +; @lds.size.93.align.8 -> 16 bytes boundary +; @lds.size.11.align.8 -> 16 bytes boundary +; @lds.size.13.align.8 -> 16 bytes boundary +; @lds.size.23.align.8 -> 16 bytes boundary +; +; kernel : @fix_alignment_6 +; +; Sorted order based on *size* should be: +; [@lds.size.93.align.8, @lds.size.67.align.8, @lds.size.33.align.8] +; +; Memory allocated in the sorted order should be: +; 93 + [pad 3] + 67 + [pad 13] + 33 [= 209 bytes] +; +; kernel : @fix_alignment_7 +; +; Sorted order based on *size* should be: +; [@lds.size.93.align.8, @lds.size.13.align.8, @lds.size.11.align.8] +; +; Memory allocated in the sorted order should be: +; 93 + [pad 3] + 13 + [pad 3] + 11 [= 123 bytes] + +; kernel : @fix_alignment_8 +; +; Sorted order based on *size* should be: +; [@lds.size.93.align.8, @lds.size.67.align.8, @lds.size.33.align.8, @lds.size.13.align.8, @lds.size.11.align.8] +; +; Memory allocated in the sorted order should be: +; 93 + [pad 3] + 67 + [pad 13] + 33 + [pad 15] + 13 + [3] + 11 [= 251 bytes] + +@lds.size.33.align.8 = internal unnamed_addr addrspace(3) global [33 x i8] undef, align 8 +@lds.size.67.align.8 = internal unnamed_addr addrspace(3) global [67 x i8] undef, align 8 +@lds.size.93.align.8 = internal unnamed_addr addrspace(3) global [93 x i8] undef, align 8 +@lds.size.11.align.8 = internal unnamed_addr addrspace(3) global [11 x i8] undef, align 8 +@lds.size.13.align.8 = internal unnamed_addr addrspace(3) global [13 x i8] undef, align 8 + +; HSA-LABEL: {{^}}fix_alignment_6: +; HSA: v_mov_b32_e32 v0, 0 +; HSA: v_mov_b32_e32 v1, 1 +; HSA: ds_write_b8 v0, v1 offset:176 +; HSA: v_mov_b32_e32 v1, 2 +; HSA: ds_write_b8 v0, v1 offset:96 +; HSA: v_mov_b32_e32 v1, 3 +; HSA: ds_write_b8 v0, v1 +; HSA: s_endpgm + +; LDS-USED: amdhsa_group_segment_fixed_size 209 +define amdgpu_kernel void @fix_alignment_6() { + %lds.size.33.align.8.bc = bitcast [33 x i8] addrspace(3)* @lds.size.33.align.8 to i8 addrspace(3)* + store i8 1, i8 addrspace(3)* %lds.size.33.align.8.bc, align 8 + + %lds.size.67.align.8.bc = bitcast [67 x i8] addrspace(3)* @lds.size.67.align.8 to i8 addrspace(3)* + store i8 2, i8 addrspace(3)* %lds.size.67.align.8.bc, align 2 + + %lds.size.93.align.8.bc = bitcast [93 x i8] addrspace(3)* @lds.size.93.align.8 to i8 addrspace(3)* + store i8 3, i8 addrspace(3)* %lds.size.93.align.8.bc, align 4 + + ret void +} + +; HSA-LABEL: {{^}}fix_alignment_7: +; HSA: v_mov_b32_e32 v0, 0 +; HSA: v_mov_b32_e32 v1, 1 +; HSA: ds_write_b8 v0, v1 +; HSA: v_mov_b32_e32 v1, 2 +; HSA: ds_write_b8 v0, v1 offset:112 +; HSA: v_mov_b32_e32 v1, 3 +; HSA: ds_write_b8 v0, v1 offset:96 +; HSA: s_endpgm + +; LDS-USED: amdhsa_group_segment_fixed_size 123 +define amdgpu_kernel void @fix_alignment_7() { + %lds.size.93.align.8.bc = bitcast [93 x i8] addrspace(3)* @lds.size.93.align.8 to i8 addrspace(3)* + store i8 1, i8 addrspace(3)* %lds.size.93.align.8.bc, align 4 + + %lds.size.11.align.8.bc = bitcast [11 x i8] addrspace(3)* @lds.size.11.align.8 to i8 addrspace(3)* + store i8 2, i8 addrspace(3)* %lds.size.11.align.8.bc, align 8 + + %lds.size.13.align.8.bc = bitcast [13 x i8] addrspace(3)* @lds.size.13.align.8 to i8 addrspace(3)* + store i8 3, i8 addrspace(3)* %lds.size.13.align.8.bc, align 2 + + ret void +} + +; HSA-LABEL: {{^}}fix_alignment_8: +; HSA: v_mov_b32_e32 v0, 0 +; HSA: v_mov_b32_e32 v1, 2 +; HSA: ds_write_b8 v0, v1 offset:176 +; HSA: v_mov_b32_e32 v1, 3 +; HSA: ds_write_b8 v0, 
v1 offset:96 +; HSA: v_mov_b32_e32 v1, 5 +; HSA: ds_write_b8 v0, v1 offset:240 +; HSA: v_mov_b32_e32 v1, 6 +; HSA: ds_write_b8 v0, v1 offset:224 +; HSA: v_mov_b32_e32 v1, 7 +; HSA: ds_write_b8 v0, v1 +; HSA: s_endpgm + +; LDS-USED: amdhsa_group_segment_fixed_size 251 +define amdgpu_kernel void @fix_alignment_8() { + %lds.size.11.align.8.bc.1 = bitcast [11 x i8] addrspace(3)* @lds.size.11.align.8 to i8 addrspace(3)* + store i8 1, i8 addrspace(3)* %lds.size.11.align.8.bc.1, align 8 + + %lds.size.33.align.8.bc.2 = bitcast [33 x i8] addrspace(3)* @lds.size.33.align.8 to i8 addrspace(3)* + store i8 2, i8 addrspace(3)* %lds.size.33.align.8.bc.2, align 8 + + %lds.size.67.align.8.bc.3 = bitcast [67 x i8] addrspace(3)* @lds.size.67.align.8 to i8 addrspace(3)* + store i8 3, i8 addrspace(3)* %lds.size.67.align.8.bc.3, align 2 + + %lds.size.93.align.8.bc.4 = bitcast [93 x i8] addrspace(3)* @lds.size.93.align.8 to i8 addrspace(3)* + store i8 4, i8 addrspace(3)* %lds.size.93.align.8.bc.4, align 4 + + %lds.size.11.align.8.bc.5 = bitcast [11 x i8] addrspace(3)* @lds.size.11.align.8 to i8 addrspace(3)* + store i8 5, i8 addrspace(3)* %lds.size.11.align.8.bc.5, align 8 + + %lds.size.13.align.8.bc.6 = bitcast [13 x i8] addrspace(3)* @lds.size.13.align.8 to i8 addrspace(3)* + store i8 6, i8 addrspace(3)* %lds.size.13.align.8.bc.6, align 2 + + %lds.size.93.align.8.bc.7 = bitcast [93 x i8] addrspace(3)* @lds.size.93.align.8 to i8 addrspace(3)* + store i8 7, i8 addrspace(3)* %lds.size.93.align.8.bc.7, align 4 + + ret void +} diff --git a/llvm/test/CodeGen/AMDGPU/lds-allocation2.ll b/llvm/test/CodeGen/AMDGPU/lds-allocation2.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/lds-allocation2.ll @@ -0,0 +1,58 @@ +; RUN: llc -march=amdgcn -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx900 --amdhsa-code-object-version=4 < %s | FileCheck -check-prefixes=HSA,LDS-USED %s + +; LDS allocation is now done in the sorted order as below: +; +; 1. First, alignment is fixed, even for missing alignment or underaligned cases since +; we know the size of LDS globals at compile time. +; 2. Next, per kernel used LDS globals are sorted, based on natural alignment, on ties, +; based on size. +; 3. Finally, memory is allocated in the above sorted order. + +; [TEST 0] The variable llvm.amdgcn.module.lds should be allocated at address 0 +; +; No alignment fix is required for allocation as all the LDS globals are on required alignment +; boundary. 
That is: +; @lds.size.111.align.16 -> 16 byte boundary +; @lds.size.222.align.16 -> 16 byte boundary +; @lds.size.333.align.16 -> 16 byte boundary +; @llvm.amdgcn.module.lds -> 16 byte boundary +; +; Sorted order should be: +; [@llvm.amdgcn.module.lds, @lds.size.333.align.16, @lds.size.222.align.16, @lds.size.111.align.16] +; +; Memory allocated in the sorted order should be: +; 16 + 333 + [pad 3] + 222 + [pad 2] + 111 [= 687 bytes] + +@lds.size.111.align.16 = internal unnamed_addr addrspace(3) global [111 x i8] undef, align 16 +@lds.size.222.align.16 = internal unnamed_addr addrspace(3) global [222 x i8] undef, align 16 +@lds.size.333.align.16 = internal unnamed_addr addrspace(3) global [333 x i8] undef, align 16 +@llvm.amdgcn.module.lds = internal unnamed_addr addrspace(3) global [16 x i8] undef, align 16 + +; HSA-LABEL: {{^}}fix_alignment_0: +; HSA: v_mov_b32_e32 v0, 0 +; HSA: v_mov_b32_e32 v1, 3 +; HSA: ds_write_b8 v0, v1 offset:576 +; HSA: v_mov_b32_e32 v1, 4 +; HSA: ds_write_b8 v0, v1 offset:352 +; HSA: v_mov_b32_e32 v1, 5 +; HSA: ds_write_b8 v0, v1 offset:16 +; HSA: v_mov_b32_e32 v1, 6 +; HSA: ds_write_b8 v0, v1 +; HSA: s_endpgm + +; LDS-USED: amdhsa_group_segment_fixed_size 687 +define amdgpu_kernel void @fix_alignment_0() { + %lds.size.111.align.16.bc = bitcast [111 x i8] addrspace(3)* @lds.size.111.align.16 to i8 addrspace(3)* + store i8 3, i8 addrspace(3)* %lds.size.111.align.16.bc, align 16 + + %lds.size.222.align.16.bc = bitcast [222 x i8] addrspace(3)* @lds.size.222.align.16 to i8 addrspace(3)* + store i8 4, i8 addrspace(3)* %lds.size.222.align.16.bc, align 16 + + %lds.size.333.align.16.bc = bitcast [333 x i8] addrspace(3)* @lds.size.333.align.16 to i8 addrspace(3)* + store i8 5, i8 addrspace(3)* %lds.size.333.align.16.bc, align 16 + + %llvm.amdgcn.module.lds.bc = bitcast [16 x i8] addrspace(3)* @llvm.amdgcn.module.lds to i8 addrspace(3)* + store i8 6, i8 addrspace(3)* %llvm.amdgcn.module.lds.bc, align 16 + + ret void +} diff --git a/llvm/test/CodeGen/AMDGPU/local-memory.amdgcn.ll b/llvm/test/CodeGen/AMDGPU/local-memory.amdgcn.ll --- a/llvm/test/CodeGen/AMDGPU/local-memory.amdgcn.ll +++ b/llvm/test/CodeGen/AMDGPU/local-memory.amdgcn.ll @@ -41,11 +41,11 @@ ; CI-DAG: v_sub_i32_e32 [[SUB:v[0-9]+]], vcc, 0, [[ADDRW]] ; CI-DAG: ds_write2_b32 [[ADDRW]], {{v[0-9]+}}, {{v[0-9]+}} offset1:4 ; SI-DAG: ds_write2_b32 [[ADDRW]], {{v[0-9]+}}, {{v[0-9]+}} offset1:4 -; SI-DAG: v_sub_i32_e32 [[SUB0:v[0-9]+]], vcc, 28, [[ADDRW]] +; SI-DAG: v_sub_i32_e32 [[SUB0:v[0-9]+]], vcc, 12, [[ADDRW]] ; GCN: s_barrier -; SI-DAG: v_sub_i32_e32 [[SUB1:v[0-9]+]], vcc, 12, [[ADDRW]] +; SI-DAG: v_sub_i32_e32 [[SUB1:v[0-9]+]], vcc, 28, [[ADDRW]] ; SI-DAG: ds_read_b32 v{{[0-9]+}}, [[SUB0]] ; SI-DAG: ds_read_b32 v{{[0-9]+}}, [[SUB1]] ; CI: ds_read2_b32 {{v\[[0-9]+:[0-9]+\]}}, [[SUB]] offset0:3 offset1:7 diff --git a/llvm/test/CodeGen/AMDGPU/promote-alloca-padding-size-estimate.ll b/llvm/test/CodeGen/AMDGPU/promote-alloca-padding-size-estimate.ll --- a/llvm/test/CodeGen/AMDGPU/promote-alloca-padding-size-estimate.ll +++ b/llvm/test/CodeGen/AMDGPU/promote-alloca-padding-size-estimate.ll @@ -11,13 +11,17 @@ ; The one with the suboptimal order resulting in extra padding exceeds ; the desired limit -; The padding estimate heuristic used by the promote alloca pass -; is mostly determined by the order of the globals, - -; Raw usage = 1060 bytes -; Rounded usage: -; 292 + (4 pad) + 256 + (8 pad) + 512 = 1072 -; 512 + (0 pad) + 256 + (0 pad) + 292 = 1060 +; The padding estimate heuristic used by the promote alloca pass is 
mostly +; determined based on following allocation order: +; +; 1. First, alignment is fixed, even for missing alignment or underaligned +; cases since we know the size of LDS globals at compile time. +; 2. Next, per kernel used LDS globals are sorted, based on natural alignment, +; on ties, based on size. +; 3. Finally, memory is allocated in the above sorted order. + +; Raw usage: 512 + 292 + 256 = 1060 bytes +; Rounded usage: 512 + (0 pad) + 292 + (12 pad) + 256 = 1072 bytes ; At default occupancy guess of 7, 2340 bytes available total. @@ -31,7 +35,7 @@ ; GCN-LABEL: {{^}}promote_alloca_size_order_0: -; GCN: workgroup_group_segment_byte_size = 1060 +; GCN: workgroup_group_segment_byte_size = 1072 define amdgpu_kernel void @promote_alloca_size_order_0(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in, i32 %idx) #0 { entry: %stack = alloca [5 x i32], align 4, addrspace(5) diff --git a/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info.ll b/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info.ll --- a/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info.ll +++ b/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info.ll @@ -10,7 +10,7 @@ ; CHECK: machineFunctionInfo: ; CHECK-NEXT: explicitKernArgSize: 128 ; CHECK-NEXT: maxKernArgAlign: 64 -; CHECK-NEXT: ldsSize: 0 +; CHECK-NEXT: ldsSize: 2048 ; CHECK-NEXT: dynLDSAlign: 1 ; CHECK-NEXT: isEntryFunction: true ; CHECK-NEXT: noSignedZerosFPMath: false
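
Reviewer note (illustration only, not part of the patch): the allocation scheme exercised by the tests above is "bump each LDS global's alignment based on its size, sort by fixed alignment, then size, then name (all descending except the name), and assign offsets by aligning the running size". The standalone C++ sketch below mirrors that scheme; the struct and helpers (LDSGlobal, fixAlignment, alignTo) are local stand-ins for illustration and are not the LLVM APIs. The inputs reproduce [TEST 1] of lds-allocation.ll, giving offsets 0, 16, 24, 28 and a total of 30 bytes, which matches the expected amdhsa_group_segment_fixed_size.

#include <algorithm>
#include <cstdint>
#include <iostream>
#include <string>
#include <vector>

struct LDSGlobal {
  std::string Name;
  uint64_t Size;
  uint64_t Align; // declared (natural) alignment
};

// Mirror of fixAlignment(): raise the alignment to that of the widest
// ds_read/ds_write (b16/b32/b64/b96/b128) the global's size allows.
static uint64_t fixAlignment(uint64_t Size, uint64_t Align) {
  if (Size > 8)
    return std::max<uint64_t>(Align, 16);
  if (Size > 4)
    return std::max<uint64_t>(Align, 8);
  if (Size > 2)
    return std::max<uint64_t>(Align, 4);
  if (Size > 1)
    return std::max<uint64_t>(Align, 2);
  return Align;
}

// Round Value up to the next multiple of Align.
static uint64_t alignTo(uint64_t Value, uint64_t Align) {
  return (Value + Align - 1) / Align * Align;
}

int main() {
  // The globals from [TEST 1] of lds-allocation.ll.
  std::vector<LDSGlobal> Globals = {
      {"lds.size.2.align.1", 2, 1},
      {"lds.size.3.align.2", 3, 2},
      {"lds.size.5.align.4", 5, 4},
      {"lds.size.9.align.8", 9, 8},
  };

  // Sort by fixed alignment (descending), then size (descending), then name.
  std::stable_sort(Globals.begin(), Globals.end(),
                   [](const LDSGlobal &L, const LDSGlobal &R) {
                     uint64_t AL = fixAlignment(L.Size, L.Align);
                     uint64_t AR = fixAlignment(R.Size, R.Align);
                     if (AL != AR)
                       return AL > AR;
                     if (L.Size != R.Size)
                       return L.Size > R.Size;
                     return L.Name < R.Name;
                   });

  // Assign offsets the way allocateLDSGlobal() does: round the running size
  // up to the fixed alignment, place the global there, then add its size.
  uint64_t StaticLDSSize = 0;
  for (const LDSGlobal &G : Globals) {
    uint64_t Offset = alignTo(StaticLDSSize, fixAlignment(G.Size, G.Align));
    StaticLDSSize = Offset + G.Size;
    std::cout << G.Name << " -> offset " << Offset << "\n";
  }
  std::cout << "total = " << StaticLDSSize << "\n"; // 30 bytes
}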