diff --git a/llvm/include/llvm/Support/Alignment.h b/llvm/include/llvm/Support/Alignment.h
--- a/llvm/include/llvm/Support/Alignment.h
+++ b/llvm/include/llvm/Support/Alignment.h
@@ -345,6 +345,10 @@
   return Rhs && *Rhs > Lhs ? *Rhs : Lhs;
 }
 
+inline Align max(Align Lhs, Align Rhs) {
+  return Lhs > Rhs ? Lhs : Rhs;
+}
+
 #ifndef NDEBUG
 // For usage in LLVM_DEBUG macros.
 inline std::string DebugStr(const Align &A) {
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.cpp b/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.cpp
--- a/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.cpp
@@ -9,11 +9,128 @@
 #include "AMDGPUMachineFunction.h"
 #include "AMDGPUPerfHintAnalysis.h"
 #include "AMDGPUSubtarget.h"
+#include "Utils/AMDGPULDSUtils.h"
 #include "llvm/CodeGen/MachineModuleInfo.h"
 #include "llvm/Target/TargetMachine.h"
 
 using namespace llvm;
 
+// Fix the natural alignment of the LDS global `GV` based on its size.
+static Align fixAlignment(const DataLayout &DL, const GlobalVariable *GV,
+                          Align Alignment) {
+  TypeSize GVSize = DL.getTypeAllocSize(GV->getValueType());
+
+  if (GVSize > 8) {
+    // We might want to use a b96 or b128 load/store
+    Alignment = max(Alignment, Align(16));
+  } else if (GVSize > 4) {
+    // We might want to use a b64 load/store
+    Alignment = max(Alignment, Align(8));
+  } else if (GVSize > 2) {
+    // We might want to use a b32 load/store
+    Alignment = max(Alignment, Align(4));
+  } else if (GVSize > 1) {
+    // We might want to use a b16 load/store
+    Alignment = max(Alignment, Align(2));
+  }
+
+  return Alignment;
+}
+
+// Check if the kernel K uses the LDS global GV.
+static bool hasKernelUsesLDS(const GlobalVariable *GV, const Function *K) {
+  SmallVector<const User *, 8> UserStack(GV->users());
+  SmallPtrSet<const User *, 8> VisitedUsers;
+
+  // The possible uses of GV are:
+  // 1. GV is directly used within an instruction as one of its operands.
+  // 2. GV is used within a constant expression, and this constant expression
+  //    itself is used either within an instruction or as an initializer of
+  //    a global variable. Though the latter case is rare, we still need to
+  //    check for it.
+  while (!UserStack.empty()) {
+    const auto *UU = UserStack.pop_back_val();
+
+    // No need to handle already visited users.
+    if (!VisitedUsers.insert(UU).second)
+      continue;
+
+    // Ignore rare (but possible) uses as in the below example.
+    //   @GV = addrspace(3) global int undef
+    //   @GV2 =
+    //     addrspace(1) global int* addrspacecast(int addrspace(3)* @GV to int*)
+    if (isa<GlobalValue>(UU))
+      continue;
+
+    // Recursively traverse through constant expressions.
+    if (isa<Constant>(UU)) {
+      append_range(UserStack, UU->users());
+      continue;
+    }
+
+    // At this point, the user should be an instruction. If this instruction
+    // belongs to the kernel K, return true.
+    const auto *I = dyn_cast<Instruction>(UU);
+    assert(I && "Expected an instruction");
+    if (I->getFunction() == K)
+      return true;
+  }
+
+  // GV is not used within kernel K.
+  return false;
+}
+
+static SmallVector<const GlobalVariable *, 8>
+getSortedUsedLDSGlobals(const Function *K) {
+  const Module *M = K->getParent();
+  const DataLayout &DL = M->getDataLayout();
+
+  // Collect all static LDS globals defined within the module.
+  SmallVector<const GlobalVariable *, 8> LDSGlobals =
+      AMDGPU::collectStaticLDSGlobals(M);
+
+  // Collect those static LDS globals which are *used* within kernel K.
+  SmallVector<const GlobalVariable *, 8> UsedLDSGlobals;
+  for (const auto *GV : LDSGlobals) {
+    if (hasKernelUsesLDS(GV, K))
+      UsedLDSGlobals.push_back(GV);
+  }
+
+  // Sort the LDS globals used within kernel K by alignment (descending); on
+  // ties, by size (descending); on further ties, by name (lexicographically).
+  llvm::stable_sort(
+      UsedLDSGlobals,
+      [&](const GlobalVariable *LHS, const GlobalVariable *RHS) -> bool {
+        Align ALHS = fixAlignment(DL, LHS, AMDGPU::getAlign(DL, LHS));
+        Align ARHS = fixAlignment(DL, RHS, AMDGPU::getAlign(DL, RHS));
+        if (ALHS != ARHS) {
+          return ALHS > ARHS;
+        }
+
+        TypeSize SLHS = DL.getTypeAllocSize(LHS->getValueType());
+        TypeSize SRHS = DL.getTypeAllocSize(RHS->getValueType());
+        if (SLHS != SRHS) {
+          return SLHS > SRHS;
+        }
+
+        return LHS->getName() < RHS->getName();
+      });
+
+  // The module LDS variable, possibly created by the "Lower Module LDS" pass,
+  // should be allocated at address 0, irrespective of its size and alignment.
+  SmallVector<const GlobalVariable *, 8> SortedLDSGlobals;
+  const GlobalVariable *ModuleLDS = M->getGlobalVariable("llvm.amdgcn.module.lds");
+  if (ModuleLDS)
+    SortedLDSGlobals.push_back(ModuleLDS);
+
+  for (const auto *GV : UsedLDSGlobals) {
+    if (GV != ModuleLDS)
+      SortedLDSGlobals.push_back(GV);
+  }
+
+  return SortedLDSGlobals;
+}
+
 AMDGPUMachineFunction::AMDGPUMachineFunction(const MachineFunction &MF)
     : MachineFunctionInfo(), Mode(MF.getFunction()),
       IsEntryFunction(
@@ -36,6 +153,15 @@
   CallingConv::ID CC = F.getCallingConv();
   if (CC == CallingConv::AMDGPU_KERNEL || CC == CallingConv::SPIR_KERNEL)
     ExplicitKernArgSize = ST.getExplicitKernArgSize(F, MaxKernArgAlign);
+
+  if (IsModuleEntryFunction) {
+    SmallVector<const GlobalVariable *, 8> SortedLDSGlobals =
+        getSortedUsedLDSGlobals(&F);
+
+    const DataLayout &DL = F.getParent()->getDataLayout();
+    for (const auto *GV : SortedLDSGlobals)
+      allocateLDSGlobal(DL, *GV);
+  }
 }
 
 unsigned AMDGPUMachineFunction::allocateLDSGlobal(const DataLayout &DL,
@@ -47,9 +173,9 @@
   Align Alignment =
       DL.getValueOrABITypeAlignment(GV.getAlign(), GV.getValueType());
 
-  /// TODO: We should sort these to minimize wasted space due to alignment
-  /// padding. Currently the padding is decided by the first encountered use
-  /// during lowering.
+  // Make sure that the natural alignment of `GV` is correctly fixed.
+  Alignment = fixAlignment(DL, &GV, Alignment);
+
   unsigned Offset = StaticLDSSize = alignTo(StaticLDSSize, Alignment);
 
   Entry.first->second = Offset;
diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPULDSUtils.h b/llvm/lib/Target/AMDGPU/Utils/AMDGPULDSUtils.h
--- a/llvm/lib/Target/AMDGPU/Utils/AMDGPULDSUtils.h
+++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPULDSUtils.h
@@ -31,6 +31,8 @@
 
 SmallPtrSet<GlobalValue *, 32> getUsedList(Module &M);
 
+SmallVector<const GlobalVariable *, 8> collectStaticLDSGlobals(const Module *M);
+
 } // end namespace AMDGPU
 
 } // end namespace llvm
diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPULDSUtils.cpp b/llvm/lib/Target/AMDGPU/Utils/AMDGPULDSUtils.cpp
--- a/llvm/lib/Target/AMDGPU/Utils/AMDGPULDSUtils.cpp
+++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPULDSUtils.cpp
@@ -29,6 +29,42 @@
                                        GV->getValueType());
 }
 
+SmallVector<const GlobalVariable *, 8>
+collectStaticLDSGlobals(const Module *M) {
+  SmallVector<const GlobalVariable *, 8> StaticLDSGlobals;
+
+  for (auto &GV : M->globals()) {
+    if (GV.getType()->getPointerAddressSpace() != AMDGPUAS::LOCAL_ADDRESS) {
+      continue;
+    }
+
+    if (!GV.hasInitializer()) {
+      // addrspace(3) without initializer implies cuda/hip extern __shared__
+      // the semantics for such a variable appears to be that all extern
+      // __shared__ variables alias one another, in which case this transform
+      // is not required
+      continue;
+    }
+
+    if (!isa<UndefValue>(GV.getInitializer())) {
+      // Initializers are unimplemented for local address space.
+      // Leave such variables in place for consistent error reporting.
+      continue;
+    }
+
+    if (GV.isConstant()) {
+      // A constant undef variable can't be written to, and any load is
+      // undef, so it should be eliminated by the optimizer. It could be
+      // dropped by the back end if not. This pass skips over it.
+      continue;
+    }
+
+    StaticLDSGlobals.push_back(&GV);
+  }
+
+  return StaticLDSGlobals;
+}
+
 bool userRequiresLowering(const SmallPtrSetImpl<GlobalValue *> &UsedList,
                           User *InitialUser) {
   // Any LDS variable can be lowered by moving into the created struct
@@ -75,36 +111,23 @@
 std::vector<GlobalVariable *> findVariablesToLower(Module &M,
                                 const SmallPtrSetImpl<GlobalValue *> &UsedList) {
+  SmallVector<const GlobalVariable *, 8> StaticLDSGlobals =
+      collectStaticLDSGlobals(&M);
+
   std::vector<GlobalVariable *> LocalVars;
-  for (auto &GV : M.globals()) {
-    if (GV.getType()->getPointerAddressSpace() != AMDGPUAS::LOCAL_ADDRESS) {
-      continue;
-    }
-    if (!GV.hasInitializer()) {
-      // addrspace(3) without initializer implies cuda/hip extern __shared__
-      // the semantics for such a variable appears to be that all extern
-      // __shared__ variables alias one another, in which case this transform
-      // is not required
-      continue;
-    }
-    if (!isa<UndefValue>(GV.getInitializer())) {
-      // Initializers are unimplemented for local address space.
-      // Leave such variables in place for consistent error reporting.
-      continue;
-    }
-    if (GV.isConstant()) {
-      // A constant undef variable can't be written to, and any load is
-      // undef, so it should be eliminated by the optimizer. It could be
-      // dropped by the back end if not. This pass skips over it.
-      continue;
-    }
-    if (std::none_of(GV.user_begin(), GV.user_end(), [&](User *U) {
-          return userRequiresLowering(UsedList, U);
-        })) {
-      continue;
-    }
-    LocalVars.push_back(&GV);
-  }
+
+  for (auto *GV : StaticLDSGlobals) {
+    GlobalVariable *GV2 = const_cast<GlobalVariable *>(GV);
+
+    if (std::none_of(GV2->user_begin(), GV2->user_end(), [&](User *U) {
+          return userRequiresLowering(UsedList, U);
+        })) {
+      continue;
+    }
+
+    LocalVars.push_back(GV2);
+  }
+
   return LocalVars;
 }
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/lds-global-value.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/lds-global-value.ll
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/lds-global-value.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/lds-global-value.ll
@@ -11,7 +11,7 @@
 ; CHECK-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
 ; CHECK-NEXT:    v_mov_b32_e32 v0, 4
 ; CHECK-NEXT:    s_mov_b32 m0, -1
-; CHECK-NEXT:    ds_read_b32 v3, v0 offset:4
+; CHECK-NEXT:    ds_read_b32 v3, v0
 ; CHECK-NEXT:    v_mov_b32_e32 v2, 9
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-NEXT:    s_add_u32 s0, s0, 4
@@ -19,7 +19,7 @@
 ; CHECK-NEXT:    v_mov_b32_e32 v0, s0
 ; CHECK-NEXT:    v_mov_b32_e32 v1, s1
 ; CHECK-NEXT:    flat_store_dword v[0:1], v3
-; CHECK-NEXT:    v_mov_b32_e32 v0, 0
+; CHECK-NEXT:    v_mov_b32_e32 v0, 0x200
 ; CHECK-NEXT:    ds_write_b32 v0, v2
 ; CHECK-NEXT:    s_endpgm
 entry:
diff --git a/llvm/test/CodeGen/AMDGPU/lds-alignment.ll b/llvm/test/CodeGen/AMDGPU/lds-alignment.ll
--- a/llvm/test/CodeGen/AMDGPU/lds-alignment.ll
+++ b/llvm/test/CodeGen/AMDGPU/lds-alignment.ll
@@ -1,5 +1,11 @@
 ; RUN: llc -march=amdgcn -mtriple=amdgcn-unknown-amdhsa --amdhsa-code-object-version=2 < %s | FileCheck -check-prefix=HSA %s
 
+; LDS allocation is now done in the sorted order:
+; 1. First, alignment is fixed, even for missing alignment or underaligned cases since we know the
+;    size of LDS globals at compile time.
+; 2. Next, per kernel used LDS globals are sorted, based on natural alignment, on ties, based on size.
+; 3. Finally, memory is allocated in the above sorted order.
+
 @lds.align16.0 = internal unnamed_addr addrspace(3) global [38 x i8] undef, align 16
 @lds.align16.1 = internal unnamed_addr addrspace(3) global [38 x i8] undef, align 16
 
@@ -25,7 +31,7 @@
 ; There are two objects, so one requires padding to be correctly
 ; aligned after the other.
-; (38 -> 48) + 38 = 92
+; 38 + (10 pad) + 38 = 86
 
 ; I don't think it is necessary to add padding after since if there
 ; were to be a dynamically sized LDS kernel arg, the runtime should
@@ -94,9 +100,9 @@
   ret void
 }
 
-; (7 * 8) + (39 * 4) = 212
+; 156 + (4 pad) + 56 = 216
 ; HSA-LABEL: {{^}}test_missing_alignment_size_2_order0:
-; HSA: workgroup_group_segment_byte_size = 212
+; HSA: workgroup_group_segment_byte_size = 216
 ; HSA: group_segment_alignment = 4
 define amdgpu_kernel void @test_missing_alignment_size_2_order0(i8 addrspace(1)* %out, i8 addrspace(1)* %in) #1 {
   %lds.missing.align.0.bc = bitcast [39 x i32] addrspace(3)* @lds.missing.align.0 to i8 addrspace(3)*
@@ -110,7 +116,7 @@
   ret void
 }
 
-; (39 * 4) + (4 pad) + (7 * 8) = 216
+; 156 + (4 pad) + 56 = 216
 ; HSA-LABEL: {{^}}test_missing_alignment_size_2_order1:
 ; HSA: workgroup_group_segment_byte_size = 216
 ; HSA: group_segment_alignment = 4
@@ -158,7 +164,7 @@
   ret void
 }
 
-; align 32, 8, 16
-; 38 (+ 2 pad) + 38 + (18 pad) + 38 = 134
+; align 32, 16, 8
+; 38 + (10 pad) + 38 + (10 pad) + 38 = 134
 ; HSA-LABEL: {{^}}test_round_size_3_order1:
 ; HSA: workgroup_group_segment_byte_size = 134
 ; HSA: group_segment_alignment = 4
@@ -179,10 +185,10 @@
   ret void
 }
 
-; align 16, 32, 8
-; 38 + (26 pad) + 38 + (10 pad) + 38 = 150
+; align 32, 16, 8
+; 38 + (10 pad) + 38 + (10 pad) + 38 = 134
 ; HSA-LABEL: {{^}}test_round_size_3_order2:
-; HSA: workgroup_group_segment_byte_size = 150
+; HSA: workgroup_group_segment_byte_size = 134
 ; HSA: group_segment_alignment = 4
 define amdgpu_kernel void @test_round_size_3_order2(i8 addrspace(1)* %out, i8 addrspace(1)* %in) #1 {
   %lds.align16.0.bc = bitcast [38 x i8] addrspace(3)* @lds.align16.0 to i8 addrspace(3)*
@@ -200,10 +206,10 @@
   ret void
 }
 
-; align 16, 8, 32
-; 38 + (2 pad) + 38 + (2 pad) + 38
+; align 32, 16, 8
+; 38 + (10 pad) + 38 + (10 pad) + 38 = 134
 ; HSA-LABEL: {{^}}test_round_size_3_order3:
-; HSA: workgroup_group_segment_byte_size = 118
+; HSA: workgroup_group_segment_byte_size = 134
 ; HSA: group_segment_alignment = 4
 define amdgpu_kernel void @test_round_size_3_order3(i8 addrspace(1)* %out, i8 addrspace(1)* %in) #1 {
   %lds.align16.0.bc = bitcast [38 x i8] addrspace(3)* @lds.align16.0 to i8 addrspace(3)*
@@ -221,10 +227,10 @@
   ret void
 }
 
-; align 8, 32, 16
-; 38 + (26 pad) + 38 + (2 pad) + 38 = 142
+; align 32, 16, 8
+; 38 + (10 pad) + 38 + (10 pad) + 38 = 134
 ; HSA-LABEL: {{^}}test_round_size_3_order4:
-; HSA: workgroup_group_segment_byte_size = 142
+; HSA: workgroup_group_segment_byte_size = 134
 ; HSA: group_segment_alignment = 4
 define amdgpu_kernel void @test_round_size_3_order4(i8 addrspace(1)* %out, i8 addrspace(1)* %in) #1 {
   %lds.align8.0.bc = bitcast [38 x i8] addrspace(3)* @lds.align8.0 to i8 addrspace(3)*
@@ -242,10 +248,10 @@
   ret void
 }
 
-; align 8, 16, 32
-; 38 + (10 pad) + 38 + (2 pad) + 38 = 126
+; align 32, 16, 8
+; 38 + (10 pad) + 38 + (10 pad) + 38 = 134
 ; HSA-LABEL: {{^}}test_round_size_3_order5:
-; HSA: workgroup_group_segment_byte_size = 126
+; HSA: workgroup_group_segment_byte_size = 134
 ; HSA: group_segment_alignment = 4
 define amdgpu_kernel void @test_round_size_3_order5(i8 addrspace(1)* %out, i8 addrspace(1)* %in) #1 {
   %lds.align8.0.bc = bitcast [38 x i8] addrspace(3)* @lds.align8.0 to i8 addrspace(3)*
diff --git a/llvm/test/CodeGen/AMDGPU/lds-allocation.ll b/llvm/test/CodeGen/AMDGPU/lds-allocation.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/lds-allocation.ll
@@ -0,0 +1,251 @@
+; RUN: llc -march=amdgcn -mtriple=amdgcn-unknown-amdhsa --amdhsa-code-object-version=4 < %s | FileCheck -check-prefix=HSA %s
+
+; LDS allocation is now done in the sorted order as below:
+;
+; 1. 
First, alignment is fixed, even for missing alignment or underaligned cases since +; we know the size of LDS globals at compile time. +; 2. Next, per kernel used LDS globals are sorted, based on natural alignment, on ties, +; based on size. +; 3. Finally, memory is allocated in the above sorted order. + + +; [TEST 0] +; +; No alignment fix is required for allocation as all the LDS globals are on required alignment +; boundary. That is: +; @lds.size.16.align.16 -> 16 byte boundary +; @lds.size.8.align.8 -> 8 byte boundary +; @lds.size.4.align.4 -> 4 byte boundary +; @lds.size.2.align.2 -> 2 byte boundary +; @lds.size.1.align.1 -> 1 byte boundary (any boundary) +; +; Sorted order based on alignment should be: +; [@lds.size.16.align.16, @lds.size.8.align.8, @lds.size.4.align.4, @lds.size.2.align.2, @lds.size.1.align.1] +; +; Memory allocated in the sorted order should be: +; 16 + 8 + 4 + 2 + 1 [= 31 bytes] + +@lds.size.1.align.1 = internal unnamed_addr addrspace(3) global [1 x i8] undef, align 1 +@lds.size.2.align.2 = internal unnamed_addr addrspace(3) global [2 x i8] undef, align 2 +@lds.size.4.align.4 = internal unnamed_addr addrspace(3) global [4 x i8] undef, align 4 +@lds.size.8.align.8 = internal unnamed_addr addrspace(3) global [8 x i8] undef, align 8 +@lds.size.16.align.16 = internal unnamed_addr addrspace(3) global [16 x i8] undef, align 16 + +; HSA-LABEL: {{^}}fix_alignment_0: +; HSA: amdhsa_group_segment_fixed_size 31 +define amdgpu_kernel void @fix_alignment_0() { + %lds.size.1.align.1.bc = bitcast [1 x i8] addrspace(3)* @lds.size.1.align.1 to i8 addrspace(3)* + store i8 0, i8 addrspace(3)* %lds.size.1.align.1.bc, align 1 + + %lds.size.2.align.2.bc = bitcast [2 x i8] addrspace(3)* @lds.size.2.align.2 to i8 addrspace(3)* + store i8 0, i8 addrspace(3)* %lds.size.2.align.2.bc, align 2 + + %lds.size.4.align.4.bc = bitcast [4 x i8] addrspace(3)* @lds.size.4.align.4 to i8 addrspace(3)* + store i8 0, i8 addrspace(3)* %lds.size.4.align.4.bc, align 4 + + %lds.size.8.align.8.bc = bitcast [8 x i8] addrspace(3)* @lds.size.8.align.8 to i8 addrspace(3)* + store i8 0, i8 addrspace(3)* %lds.size.8.align.8.bc, align 8 + + %lds.size.16.align.16.bc = bitcast [16 x i8] addrspace(3)* @lds.size.16.align.16 to i8 addrspace(3)* + store i8 0, i8 addrspace(3)* %lds.size.16.align.16.bc, align 16 + + ret void +} + + +; [TEST 1] +; +; Fixed alignments for allocation should be: +; @lds.size.9.align.8 -> 16 byte boundary +; @lds.size.5.align.4 -> 8 byte boundary +; @lds.size.3.align.2 -> 4 byte boundary +; @lds.size.2.align.1 -> 2 byte boundary +; +; Sorted order based on *fixed* alignment should be: +; [@lds.size.9.align.8, lds.size.5.align.4, lds.size.3.align.2, lds.size.2.align.1] +; +; Memory allocated in the sorted order should be: +; 9 + (pad 7) + 5 + (pad 3) + 3 + (pad 1) + 2 [= 30 bytes] + +@lds.size.2.align.1 = internal unnamed_addr addrspace(3) global [2 x i8] undef, align 1 +@lds.size.3.align.2 = internal unnamed_addr addrspace(3) global [3 x i8] undef, align 2 +@lds.size.5.align.4 = internal unnamed_addr addrspace(3) global [5 x i8] undef, align 4 +@lds.size.9.align.8 = internal unnamed_addr addrspace(3) global [9 x i8] undef, align 8 + +; HSA-LABEL: {{^}}fix_alignment_1: +; HSA: amdhsa_group_segment_fixed_size 30 +define amdgpu_kernel void @fix_alignment_1() { + %lds.size.2.align.1.bc = bitcast [2 x i8] addrspace(3)* @lds.size.2.align.1 to i8 addrspace(3)* + store i8 0, i8 addrspace(3)* %lds.size.2.align.1.bc, align 1 + + %lds.size.3.align.2.bc = bitcast [3 x i8] addrspace(3)* @lds.size.3.align.2 to 
i8 addrspace(3)* + store i8 0, i8 addrspace(3)* %lds.size.3.align.2.bc, align 2 + + %lds.size.5.align.4.bc = bitcast [5 x i8] addrspace(3)* @lds.size.5.align.4 to i8 addrspace(3)* + store i8 0, i8 addrspace(3)* %lds.size.5.align.4.bc, align 4 + + %lds.size.9.align.8.bc = bitcast [9 x i8] addrspace(3)* @lds.size.9.align.8 to i8 addrspace(3)* + store i8 0, i8 addrspace(3)* %lds.size.9.align.8.bc, align 8 + + ret void +} + + +; [TEST 2] +; +; Fixed alignments for allocation should be: +; @lds.size.9.align.1 -> 16 byte boundary +; @lds.size.10.align.2 -> 16 byte boundary +; @lds.size.11.align.4 -> 16 byte boundary +; @lds.size.12.align.8 -> 16 byte boundary +; +; Sorted order based on *fixed* alignment and then by size should be: +; [@lds.size.12.align.8, @lds.size.11.align.4, @lds.size.10.align.2, @lds.size.9.align.1] +; +; Memory allocated in the sorted order should be: +; 12 + (pad 4) + 11 + (pad 5) + 10 + (pad 6) + 9 [= 57 bytes] + +@lds.size.9.align.1 = internal unnamed_addr addrspace(3) global [9 x i8] undef, align 1 +@lds.size.10.align.2 = internal unnamed_addr addrspace(3) global [10 x i8] undef, align 2 +@lds.size.11.align.4 = internal unnamed_addr addrspace(3) global [11 x i8] undef, align 4 +@lds.size.12.align.8 = internal unnamed_addr addrspace(3) global [12 x i8] undef, align 8 + +; HSA-LABEL: {{^}}fix_alignment_2: +; HSA: amdhsa_group_segment_fixed_size 57 +define amdgpu_kernel void @fix_alignment_2() { + %lds.size.9.align.1.bc = bitcast [9 x i8] addrspace(3)* @lds.size.9.align.1 to i8 addrspace(3)* + store i8 0, i8 addrspace(3)* %lds.size.9.align.1.bc, align 1 + + %lds.size.10.align.2.bc = bitcast [10 x i8] addrspace(3)* @lds.size.10.align.2 to i8 addrspace(3)* + store i8 0, i8 addrspace(3)* %lds.size.10.align.2.bc, align 2 + + %lds.size.11.align.4.bc = bitcast [11 x i8] addrspace(3)* @lds.size.11.align.4 to i8 addrspace(3)* + store i8 0, i8 addrspace(3)* %lds.size.11.align.4.bc, align 4 + + %lds.size.12.align.8.bc = bitcast [12 x i8] addrspace(3)* @lds.size.12.align.8 to i8 addrspace(3)* + store i8 0, i8 addrspace(3)* %lds.size.12.align.8.bc, align 8 + + ret void +} + + +; [TEST 3] +; +; Fixed alignments for allocation should be: +; @lds.size.5.align.2 -> 8 byte boundary +; @lds.size.6.align.2 -> 8 byte boundary +; @lds.size.7.align.2 -> 8 byte boundary +; @lds.size.7.align.4 -> 8 byte boundary +; +; Sorted order based on *fixed* alignment and then by size should be: +; [@lds.size.7.align.4, @lds.size.7.align.2, @lds.size.6.align.2, @lds.size.5.align.2] +; OR +; [@lds.size.7.align.2, @lds.size.7.align.4, @lds.size.6.align.2, @lds.size.5.align.2] +; +; Memory allocated in the sorted order should be: +; 7 + (pad 1) + 7 + (pad 1) + 6 + (pad 2) + 5 [= 29 bytes] + +@lds.size.5.align.2 = internal unnamed_addr addrspace(3) global [5 x i8] undef, align 2 +@lds.size.6.align.2 = internal unnamed_addr addrspace(3) global [6 x i8] undef, align 2 +@lds.size.7.align.2 = internal unnamed_addr addrspace(3) global [7 x i8] undef, align 2 +@lds.size.7.align.4 = internal unnamed_addr addrspace(3) global [7 x i8] undef, align 4 + +; HSA-LABEL: {{^}}fix_alignment_3: +; HSA: amdhsa_group_segment_fixed_size 29 +define amdgpu_kernel void @fix_alignment_3() { + %lds.size.5.align.2.bc = bitcast [5 x i8] addrspace(3)* @lds.size.5.align.2 to i8 addrspace(3)* + store i8 0, i8 addrspace(3)* %lds.size.5.align.2.bc, align 2 + + %lds.size.6.align.2.bc = bitcast [6 x i8] addrspace(3)* @lds.size.6.align.2 to i8 addrspace(3)* + store i8 0, i8 addrspace(3)* %lds.size.6.align.2.bc, align 2 + + 
%lds.size.7.align.2.bc = bitcast [7 x i8] addrspace(3)* @lds.size.7.align.2 to i8 addrspace(3)* + store i8 0, i8 addrspace(3)* %lds.size.7.align.2.bc, align 2 + + %lds.size.7.align.4.bc = bitcast [7 x i8] addrspace(3)* @lds.size.7.align.4 to i8 addrspace(3)* + store i8 0, i8 addrspace(3)* %lds.size.7.align.4.bc, align 4 + + ret void +} + + +; [TEST 4] +; +; No alignment fix is required for allocation as all the LDS globals are on required alignment +; boundary. That is: +; @lds.size.17.align.16 -> 16 byte boundary +; @lds.size.18.align.16 -> 16 byte boundary +; @lds.size.19.align.16 -> 16 byte boundary +; @lds.size.20.align.16 -> 16 byte boundary +; +; Sorted order based on *size* should be: +; [@lds.size.20.align.16, @lds.size.19.align.16, @lds.size.18.align.16, @lds.size.17.align.16] +; +; Memory allocated in the sorted order should be: +; 20 + (pad 12) + 19 + (pad 13) + 18 + (pad 14) + 17 [= 113 bytes] + +@lds.size.17.align.16 = internal unnamed_addr addrspace(3) global [17 x i8] undef, align 16 +@lds.size.18.align.16 = internal unnamed_addr addrspace(3) global [18 x i8] undef, align 16 +@lds.size.19.align.16 = internal unnamed_addr addrspace(3) global [19 x i8] undef, align 16 +@lds.size.20.align.16 = internal unnamed_addr addrspace(3) global [20 x i8] undef, align 16 + +; HSA-LABEL: {{^}}fix_alignment_4: +; HSA: amdhsa_group_segment_fixed_size 113 +define amdgpu_kernel void @fix_alignment_4() { + %lds.size.17.align.16.bc = bitcast [17 x i8] addrspace(3)* @lds.size.17.align.16 to i8 addrspace(3)* + store i8 0, i8 addrspace(3)* %lds.size.17.align.16.bc, align 16 + + %lds.size.18.align.16.bc = bitcast [18 x i8] addrspace(3)* @lds.size.18.align.16 to i8 addrspace(3)* + store i8 0, i8 addrspace(3)* %lds.size.18.align.16.bc, align 16 + + %lds.size.19.align.16.bc = bitcast [19 x i8] addrspace(3)* @lds.size.19.align.16 to i8 addrspace(3)* + store i8 0, i8 addrspace(3)* %lds.size.19.align.16.bc, align 16 + + %lds.size.20.align.16.bc = bitcast [20 x i8] addrspace(3)* @lds.size.20.align.16 to i8 addrspace(3)* + store i8 0, i8 addrspace(3)* %lds.size.20.align.16.bc, align 16 + + ret void +} + +; [TEST 5] +; +; No alignment fix is required for allocation as all the LDS globals are on required alignment +; boundary. 
That is:
+;   @size.1.align.1  -> any boundary
+;   @size.1.align.2  -> any boundary
+;   @size.1.align.4  -> any boundary
+;   @size.1.align.8  -> any boundary
+;   @size.1.align.16 -> any boundary
+;
+; Sorted order based on *alignment* should be:
+;   [@size.1.align.16, @size.1.align.8, @size.1.align.4, @size.1.align.2, @size.1.align.1]
+;
+; Memory allocated in the sorted order should be:
+;   1 + (pad 7) + 1 + (pad 3) + 1 + (pad 1) + 1 + 1 [= 16 bytes]
+
+@size.1.align.1 = internal unnamed_addr addrspace(3) global [1 x i8] undef, align 1
+@size.1.align.2 = internal unnamed_addr addrspace(3) global [1 x i8] undef, align 2
+@size.1.align.4 = internal unnamed_addr addrspace(3) global [1 x i8] undef, align 4
+@size.1.align.8 = internal unnamed_addr addrspace(3) global [1 x i8] undef, align 8
+@size.1.align.16 = internal unnamed_addr addrspace(3) global [1 x i8] undef, align 16
+
+; HSA-LABEL: {{^}}fix_alignment_5:
+; HSA: amdhsa_group_segment_fixed_size 16
+define amdgpu_kernel void @fix_alignment_5() {
+  %lds.size.1.align.1.bc = bitcast [1 x i8] addrspace(3)* @size.1.align.1 to i8 addrspace(3)*
+  store i8 0, i8 addrspace(3)* %lds.size.1.align.1.bc, align 1
+
+  %lds.size.1.align.2.bc = bitcast [1 x i8] addrspace(3)* @size.1.align.2 to i8 addrspace(3)*
+  store i8 0, i8 addrspace(3)* %lds.size.1.align.2.bc, align 2
+
+  %lds.size.1.align.4.bc = bitcast [1 x i8] addrspace(3)* @size.1.align.4 to i8 addrspace(3)*
+  store i8 0, i8 addrspace(3)* %lds.size.1.align.4.bc, align 4
+
+  %lds.size.1.align.8.bc = bitcast [1 x i8] addrspace(3)* @size.1.align.8 to i8 addrspace(3)*
+  store i8 0, i8 addrspace(3)* %lds.size.1.align.8.bc, align 8
+
+  %lds.size.1.align.16.bc = bitcast [1 x i8] addrspace(3)* @size.1.align.16 to i8 addrspace(3)*
+  store i8 0, i8 addrspace(3)* %lds.size.1.align.16.bc, align 16
+
+  ret void
+}
diff --git a/llvm/test/CodeGen/AMDGPU/local-memory.amdgcn.ll b/llvm/test/CodeGen/AMDGPU/local-memory.amdgcn.ll
--- a/llvm/test/CodeGen/AMDGPU/local-memory.amdgcn.ll
+++ b/llvm/test/CodeGen/AMDGPU/local-memory.amdgcn.ll
@@ -41,11 +41,11 @@
 ; CI-DAG: v_sub_i32_e32 [[SUB:v[0-9]+]], vcc, 0, [[ADDRW]]
 ; CI-DAG: ds_write2_b32 [[ADDRW]], {{v[0-9]+}}, {{v[0-9]+}} offset1:4
 ; SI-DAG: ds_write2_b32 [[ADDRW]], {{v[0-9]+}}, {{v[0-9]+}} offset1:4
-; SI-DAG: v_sub_i32_e32 [[SUB0:v[0-9]+]], vcc, 28, [[ADDRW]]
+; SI-DAG: v_sub_i32_e32 [[SUB0:v[0-9]+]], vcc, 12, [[ADDRW]]
 ; GCN: s_barrier
-; SI-DAG: v_sub_i32_e32 [[SUB1:v[0-9]+]], vcc, 12, [[ADDRW]]
+; SI-DAG: v_sub_i32_e32 [[SUB1:v[0-9]+]], vcc, 28, [[ADDRW]]
 ; SI-DAG: ds_read_b32 v{{[0-9]+}}, [[SUB0]]
 ; SI-DAG: ds_read_b32 v{{[0-9]+}}, [[SUB1]]
 ; CI: ds_read2_b32 {{v\[[0-9]+:[0-9]+\]}}, [[SUB]] offset0:3 offset1:7
diff --git a/llvm/test/CodeGen/AMDGPU/promote-alloca-padding-size-estimate.ll b/llvm/test/CodeGen/AMDGPU/promote-alloca-padding-size-estimate.ll
--- a/llvm/test/CodeGen/AMDGPU/promote-alloca-padding-size-estimate.ll
+++ b/llvm/test/CodeGen/AMDGPU/promote-alloca-padding-size-estimate.ll
@@ -11,13 +11,17 @@
 ; The one with the suboptimal order resulting in extra padding exceeds
 ; the desired limit
 
-; The padding estimate heuristic used by the promote alloca pass
-; is mostly determined by the order of the globals,
-
-; Raw usage = 1060 bytes
-; Rounded usage:
-; 292 + (4 pad) + 256 + (8 pad) + 512 = 1072
-; 512 + (0 pad) + 256 + (0 pad) + 292 = 1060
+; The padding estimate heuristic used by the promote alloca pass is mostly
+; determined based on the following allocation order:
+;
+; 1. 
First, alignment is fixed, even for missing alignment or underaligned +; cases since we know the size of LDS globals at compile time. +; 2. Next, per kernel used LDS globals are sorted, based on natural alignment, +; on ties, based on size. +; 3. Finally, memory is allocated in the above sorted order. + +; Raw usage: 512 + 292 + 256 = 1060 bytes +; Rounded usage: 512 + (0 pad) + 292 + (12 pad) + 256 = 1072 bytes ; At default occupancy guess of 7, 2340 bytes available total. @@ -31,7 +35,7 @@ ; GCN-LABEL: {{^}}promote_alloca_size_order_0: -; GCN: workgroup_group_segment_byte_size = 1060 +; GCN: workgroup_group_segment_byte_size = 1072 define amdgpu_kernel void @promote_alloca_size_order_0(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in, i32 %idx) #0 { entry: %stack = alloca [5 x i32], align 4, addrspace(5)