Index: llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp =================================================================== --- llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp +++ llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp @@ -46,8 +46,8 @@ #include "llvm/Pass.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" +#include "llvm/Support/OptimizedStructLayout.h" #include "llvm/Transforms/Utils/ModuleUtils.h" -#include #include #define DEBUG_TYPE "amdgpu-lower-module-lds" @@ -210,35 +210,25 @@ } } - // Sort by alignment, descending, to minimise padding. - // On ties, sort by size, descending, then by name, lexicographical. - llvm::stable_sort( - FoundLocalVars, - [&](const GlobalVariable *LHS, const GlobalVariable *RHS) -> bool { - Align ALHS = AMDGPU::getAlign(DL, LHS); - Align ARHS = AMDGPU::getAlign(DL, RHS); - if (ALHS != ARHS) { - return ALHS > ARHS; - } - - TypeSize SLHS = DL.getTypeAllocSize(LHS->getValueType()); - TypeSize SRHS = DL.getTypeAllocSize(RHS->getValueType()); - if (SLHS != SRHS) { - return SLHS > SRHS; - } - - // By variable name on tie for predictable order in test cases. - return LHS->getName() < RHS->getName(); - }); + SmallVector LayoutFields; + LayoutFields.reserve(FoundLocalVars.size()); + for (GlobalVariable *GV : FoundLocalVars) { + OptimizedStructLayoutField F(GV, DL.getTypeAllocSize(GV->getValueType()), + AMDGPU::getAlign(DL, GV)); + LayoutFields.emplace_back(F); + } + + performOptimizedStructLayout(LayoutFields); std::vector LocalVars; LocalVars.reserve(FoundLocalVars.size()); // will be at least this large { // This usually won't need to insert any padding, perhaps avoid the alloc uint64_t CurrentOffset = 0; - for (size_t I = 0; I < FoundLocalVars.size(); I++) { - GlobalVariable *FGV = FoundLocalVars[I]; - Align DataAlign = AMDGPU::getAlign(DL, FGV); + for (size_t I = 0; I < LayoutFields.size(); I++) { + GlobalVariable *FGV = static_cast( + const_cast(LayoutFields[I].Id)); + Align DataAlign = LayoutFields[I].Alignment; uint64_t DataAlignV = DataAlign.value(); if (uint64_t Rem = CurrentOffset % DataAlignV) { @@ -257,7 +247,7 @@ } LocalVars.push_back(FGV); - CurrentOffset += DL.getTypeAllocSize(FGV->getValueType()); + CurrentOffset += LayoutFields[I].Size; } } @@ -272,14 +262,14 @@ : "llvm.amdgcn.module.lds"); StructType *LDSTy = StructType::create(Ctx, LocalVarTypes, VarName + ".t"); - Align MaxAlign = - AMDGPU::getAlign(DL, LocalVars[0]); // was sorted on alignment + Align StructAlign = + AMDGPU::getAlign(DL, LocalVars[0]); GlobalVariable *SGV = new GlobalVariable( M, LDSTy, false, GlobalValue::InternalLinkage, UndefValue::get(LDSTy), VarName, nullptr, GlobalValue::NotThreadLocal, AMDGPUAS::LOCAL_ADDRESS, false); - SGV->setAlignment(MaxAlign); + SGV->setAlignment(StructAlign); if (!F) { appendToCompilerUsed( M, {static_cast( Index: llvm/test/CodeGen/AMDGPU/lower-module-lds-constantexpr.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/lower-module-lds-constantexpr.ll +++ llvm/test/CodeGen/AMDGPU/lower-module-lds-constantexpr.ll @@ -20,7 +20,7 @@ ; CHECK: @llvm.amdgcn.kernel.timestwo.lds = internal addrspace(3) global %llvm.amdgcn.kernel.timestwo.lds.t undef, align 4 ; CHECK-LABEL: @get_func() -; CHECK: %0 = load i32, i32* inttoptr (i64 add (i64 ptrtoint (i32* addrspacecast (i32 addrspace(3)* bitcast (float addrspace(3)* getelementptr inbounds (%llvm.amdgcn.module.lds.t, %llvm.amdgcn.module.lds.t addrspace(3)* @llvm.amdgcn.module.lds, i32 0, i32 1) to i32 addrspace(3)*) to i32*) to i64), i64 ptrtoint (i32* addrspacecast (i32 addrspace(3)* bitcast (float addrspace(3)* getelementptr inbounds (%llvm.amdgcn.module.lds.t, %llvm.amdgcn.module.lds.t addrspace(3)* @llvm.amdgcn.module.lds, i32 0, i32 1) to i32 addrspace(3)*) to i32*) to i64)) to i32*), align 4 +; CHECK: %0 = load i32, i32* inttoptr (i64 add (i64 ptrtoint (i32* addrspacecast (i32 addrspace(3)* bitcast (%llvm.amdgcn.module.lds.t addrspace(3)* @llvm.amdgcn.module.lds to i32 addrspace(3)*) to i32*) to i64), i64 ptrtoint (i32* addrspacecast (i32 addrspace(3)* bitcast (%llvm.amdgcn.module.lds.t addrspace(3)* @llvm.amdgcn.module.lds to i32 addrspace(3)*) to i32*) to i64)) to i32*), align 4 define i32 @get_func() local_unnamed_addr #0 { entry: %0 = load i32, i32* inttoptr (i64 add (i64 ptrtoint (i32* addrspacecast (i32 addrspace(3)* bitcast (float addrspace(3)* @func to i32 addrspace(3)*) to i32*) to i64), i64 ptrtoint (i32* addrspacecast (i32 addrspace(3)* bitcast (float addrspace(3)* @func to i32 addrspace(3)*) to i32*) to i64)) to i32*), align 4 @@ -28,7 +28,7 @@ } ; CHECK-LABEL: @set_func(i32 %x) -; CHECK: store i32 %x, i32* inttoptr (i64 add (i64 ptrtoint (i32* addrspacecast (i32 addrspace(3)* bitcast (%llvm.amdgcn.module.lds.t addrspace(3)* @llvm.amdgcn.module.lds to i32 addrspace(3)*) to i32*) to i64), i64 ptrtoint (i32* addrspacecast (i32 addrspace(3)* bitcast (%llvm.amdgcn.module.lds.t addrspace(3)* @llvm.amdgcn.module.lds to i32 addrspace(3)*) to i32*) to i64)) to i32*), align 4 +; CHECK: store i32 %x, i32* inttoptr (i64 add (i64 ptrtoint (i32* addrspacecast (i32 addrspace(3)* bitcast (float addrspace(3)* getelementptr inbounds (%llvm.amdgcn.module.lds.t, %llvm.amdgcn.module.lds.t addrspace(3)* @llvm.amdgcn.module.lds, i32 0, i32 1) to i32 addrspace(3)*) to i32*) to i64), i64 ptrtoint (i32* addrspacecast (i32 addrspace(3)* bitcast (float addrspace(3)* getelementptr inbounds (%llvm.amdgcn.module.lds.t, %llvm.amdgcn.module.lds.t addrspace(3)* @llvm.amdgcn.module.lds, i32 0, i32 1) to i32 addrspace(3)*) to i32*) to i64)) to i32*), align 4 define void @set_func(i32 %x) local_unnamed_addr #1 { entry: store i32 %x, i32* inttoptr (i64 add (i64 ptrtoint (i32* addrspacecast (i32 addrspace(3)* bitcast (float addrspace(3)* @both to i32 addrspace(3)*) to i32*) to i64), i64 ptrtoint (i32* addrspacecast (i32 addrspace(3)* bitcast (float addrspace(3)* @both to i32 addrspace(3)*) to i32*) to i64)) to i32*), align 4 @@ -40,14 +40,14 @@ ; CHECK: %1 = bitcast float addrspace(3)* getelementptr inbounds (%llvm.amdgcn.kernel.timestwo.lds.t, %llvm.amdgcn.kernel.timestwo.lds.t addrspace(3)* @llvm.amdgcn.kernel.timestwo.lds, i32 0, i32 0) to i32 addrspace(3)* ; CHECK: %2 = addrspacecast i32 addrspace(3)* %1 to i32* ; CHECK: %3 = ptrtoint i32* %2 to i64 -; CHECK: %4 = add i64 ptrtoint (i32* addrspacecast (i32 addrspace(3)* bitcast (%llvm.amdgcn.module.lds.t addrspace(3)* @llvm.amdgcn.module.lds to i32 addrspace(3)*) to i32*) to i64), %3 +; CHECK: %4 = add i64 ptrtoint (i32* addrspacecast (i32 addrspace(3)* bitcast (float addrspace(3)* getelementptr inbounds (%llvm.amdgcn.module.lds.t, %llvm.amdgcn.module.lds.t addrspace(3)* @llvm.amdgcn.module.lds, i32 0, i32 1) to i32 addrspace(3)*) to i32*) to i64), %3 ; CHECK: %5 = inttoptr i64 %4 to i32* ; CHECK: %ld = load i32, i32* %5, align 4 ; CHECK: %mul = mul i32 %ld, 2 ; CHECK: %6 = bitcast float addrspace(3)* getelementptr inbounds (%llvm.amdgcn.kernel.timestwo.lds.t, %llvm.amdgcn.kernel.timestwo.lds.t addrspace(3)* @llvm.amdgcn.kernel.timestwo.lds, i32 0, i32 0) to i32 addrspace(3)* ; CHECK: %7 = addrspacecast i32 addrspace(3)* %6 to i32* ; CHECK: %8 = ptrtoint i32* %7 to i64 -; CHECK: %9 = add i64 %8, ptrtoint (i32* addrspacecast (i32 addrspace(3)* bitcast (%llvm.amdgcn.module.lds.t addrspace(3)* @llvm.amdgcn.module.lds to i32 addrspace(3)*) to i32*) to i64) +; CHECK: %9 = add i64 %8, ptrtoint (i32* addrspacecast (i32 addrspace(3)* bitcast (float addrspace(3)* getelementptr inbounds (%llvm.amdgcn.module.lds.t, %llvm.amdgcn.module.lds.t addrspace(3)* @llvm.amdgcn.module.lds, i32 0, i32 1) to i32 addrspace(3)*) to i32*) to i64) ; CHECK: %10 = inttoptr i64 %9 to i32* ; CHECK: store i32 %mul, i32* %10, align 4 define amdgpu_kernel void @timestwo() { Index: llvm/test/CodeGen/AMDGPU/lower-module-lds-global-alias.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/lower-module-lds-global-alias.ll +++ llvm/test/CodeGen/AMDGPU/lower-module-lds-global-alias.ll @@ -16,7 +16,7 @@ ; But none of them are used anywhere. Hence, @lds.6 is not lowered. ;. -; CHECK: %llvm.amdgcn.module.lds.t = type { [4 x i8], [3 x i8], [1 x i8], [2 x i8], [1 x i8] } +; CHECK: %llvm.amdgcn.module.lds.t = type { [4 x i8], [3 x i8], [1 x i8], [2 x i8] } ; CHECK-NOT: @lds.1 ; CHECK-NOT: @lds.2 @@ -41,7 +41,7 @@ ; CHECK: @llvm.amdgcn.module.lds = internal addrspace(3) global %llvm.amdgcn.module.lds.t undef, align 4 ; CHECK: @llvm.compiler.used = appending global [1 x i8*] [i8* addrspacecast (i8 addrspace(3)* getelementptr inbounds (%llvm.amdgcn.module.lds.t, %llvm.amdgcn.module.lds.t addrspace(3)* @llvm.amdgcn.module.lds, i32 0, i32 0, i32 0) to i8*)], section "llvm.metadata" -; CHECK: @alias.to.lds.1 = alias [1 x i8], getelementptr inbounds (%llvm.amdgcn.module.lds.t, %llvm.amdgcn.module.lds.t addrspace(3)* @llvm.amdgcn.module.lds, i32 0, i32 4) +; CHECK: @alias.to.lds.1 = alias [1 x i8], getelementptr inbounds (%llvm.amdgcn.module.lds.t, %llvm.amdgcn.module.lds.t addrspace(3)* @llvm.amdgcn.module.lds, i32 0, i32 2) ; CHECK: @alias.to.lds.2 = alias [2 x i8], getelementptr inbounds (%llvm.amdgcn.module.lds.t, %llvm.amdgcn.module.lds.t addrspace(3)* @llvm.amdgcn.module.lds, i32 0, i32 3) ; CHECK: @alias.to.gptr.3 = alias i64*, i64* addrspace(1)* @gptr.3 ; CHECK: @alias.to.gptr.4 = alias i64*, i64* addrspace(1)* @gptr.4 Index: llvm/test/CodeGen/AMDGPU/lower-module-lds-global-uses.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/lower-module-lds-global-uses.ll +++ llvm/test/CodeGen/AMDGPU/lower-module-lds-global-uses.ll @@ -17,7 +17,7 @@ ; and @gptr.8 is used within non-kernel function @f1. Hence @lds.7 is lowered. ;. -; CHECK: %llvm.amdgcn.module.lds.t = type { [3 x float], [4 x i8], [2 x float], [1 x float] } +; CHECK: %llvm.amdgcn.module.lds.t = type { [3 x float], [1 x float], [2 x float] } ; CHECK: @lds.1 = addrspace(3) global i16 undef, align 2 ; CHECK: @lds.2 = addrspace(3) global i32 undef, align 4 @@ -36,7 +36,7 @@ ; CHECK: @gptr.3 = addrspace(1) global i64* addrspacecast (i64 addrspace(3)* @lds.3 to i64*), align 8 ; CHECK: @gptr.4 = addrspace(1) global i64* addrspacecast (i64 addrspace(3)* bitcast (float addrspace(3)* @lds.4 to i64 addrspace(3)*) to i64*), align 8 -; CHECK: @gptr.5 = addrspace(1) global i64* addrspacecast (i64 addrspace(3)* bitcast ([1 x float] addrspace(3)* getelementptr inbounds (%llvm.amdgcn.module.lds.t, %llvm.amdgcn.module.lds.t addrspace(3)* @llvm.amdgcn.module.lds, i32 0, i32 3) to i64 addrspace(3)*) to i64*), align 8 +; CHECK: @gptr.5 = addrspace(1) global i64* addrspacecast (i64 addrspace(3)* bitcast ([1 x float] addrspace(3)* getelementptr inbounds (%llvm.amdgcn.module.lds.t, %llvm.amdgcn.module.lds.t addrspace(3)* @llvm.amdgcn.module.lds, i32 0, i32 1) to i64 addrspace(3)*) to i64*), align 8 ; CHECK: @gptr.6 = addrspace(1) global i64* addrspacecast (i64 addrspace(3)* bitcast ([2 x float] addrspace(3)* getelementptr inbounds (%llvm.amdgcn.module.lds.t, %llvm.amdgcn.module.lds.t addrspace(3)* @llvm.amdgcn.module.lds, i32 0, i32 2) to i64 addrspace(3)*) to i64*), align 8 ; CHECK: @gptr.7 = addrspace(1) global i64* addrspacecast (i64 addrspace(3)* bitcast (%llvm.amdgcn.module.lds.t addrspace(3)* @llvm.amdgcn.module.lds to i64 addrspace(3)*) to i64*), align 8 ; CHECK: @gptr.8 = addrspace(1) global i64** addrspacecast (i64* addrspace(1)* @gptr.7 to i64**), align 8 Index: llvm/test/CodeGen/AMDGPU/update-lds-alignment.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/update-lds-alignment.ll +++ llvm/test/CodeGen/AMDGPU/update-lds-alignment.ll @@ -5,10 +5,10 @@ ; CHECK: %llvm.amdgcn.kernel.k0.lds.t = type { [16 x i8], [8 x i8], [4 x i8], [2 x i8], [1 x i8] } ; Different properly aligned values, but same size of 1. -; CHECK: %llvm.amdgcn.kernel.k1.lds.t = type { [1 x i8], [7 x i8], [1 x i8], [3 x i8], [1 x i8], [1 x i8], [1 x i8], [1 x i8] } +; CHECK: %llvm.amdgcn.kernel.k1.lds.t = type { [1 x i8], [1 x i8], [1 x i8], [1 x i8], [1 x i8], [3 x i8], [1 x i8] } ; All are under-aligned, requires to fix each on different alignment boundary. -; CHECK: %llvm.amdgcn.kernel.k2.lds.t = type { [9 x i8], [7 x i8], [5 x i8], [3 x i8], [3 x i8], [1 x i8], [2 x i8] } +; CHECK: %llvm.amdgcn.kernel.k2.lds.t = type { [9 x i8], [1 x i8], [2 x i8], [3 x i8], [1 x i8], [5 x i8] } ; All LDS are underaligned, requires to allocate on 8 byte boundary ; CHECK: %llvm.amdgcn.kernel.k3.lds.t = type { [7 x i8], [1 x i8], [7 x i8], [1 x i8], [6 x i8], [2 x i8], [5 x i8] }