Index: llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
===================================================================
--- llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
+++ llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
@@ -749,41 +749,79 @@
   if (LocalMemLimit == 0)
     return false;
 
-  const DataLayout &DL = Mod->getDataLayout();
+  SmallVector<const Constant *, 16> Stack;
+  SmallPtrSet<const Constant *, 8> VisitedConstants;
+  SmallPtrSet<const GlobalVariable *, 8> UsedLDS;
+
+  auto visitUsers = [&](const GlobalVariable *GV, const Constant *Val) -> bool {
+    for (const User *U : Val->users()) {
+      if (const Instruction *Use = dyn_cast<Instruction>(U)) {
+        if (Use->getParent()->getParent() == &F)
+          return true;
+      } else {
+        const Constant *C = cast<Constant>(U);
+        if (VisitedConstants.insert(C).second)
+          Stack.push_back(C);
+      }
+    }
+
+    return false;
+  };
 
-  // Check how much local memory is being used by global objects
-  CurrentLocalMemUsage = 0;
   for (GlobalVariable &GV : Mod->globals()) {
     if (GV.getAddressSpace() != AMDGPUAS::LOCAL_ADDRESS)
       continue;
 
-    for (const User *U : GV.users()) {
-      const Instruction *Use = dyn_cast<Instruction>(U);
-      if (!Use) {
-        // FIXME: This is probably a constant expression use. We should
-        // recursively search the users of it for the parent function instead of
-        // bailing.
-        LLVM_DEBUG(dbgs() << "Giving up on LDS size estimate "
-                             "due to constant expression\n");
-        return false;
-      }
+    if (visitUsers(&GV, &GV)) {
+      UsedLDS.insert(&GV);
+      Stack.clear();
+      continue;
+    }
 
-      if (Use->getParent()->getParent() == &F) {
-        Align Alignment =
-            DL.getValueOrABITypeAlignment(GV.getAlign(), GV.getValueType());
-
-        // FIXME: Try to account for padding here. The padding is currently
-        // determined from the inverse order of uses in the function. I'm not
-        // sure if the use list order is in any way connected to this, so the
-        // total reported size is likely incorrect.
-        uint64_t AllocSize = DL.getTypeAllocSize(GV.getValueType());
-        CurrentLocalMemUsage = alignTo(CurrentLocalMemUsage, Alignment);
-        CurrentLocalMemUsage += AllocSize;
+    // For any ConstantExpr uses, we need to recursively search the users until
+    // we see a function.
+    while (!Stack.empty()) {
+      const Constant *C = Stack.pop_back_val();
+      if (visitUsers(&GV, C)) {
+        UsedLDS.insert(&GV);
+        Stack.clear();
         break;
       }
     }
   }
 
+  const DataLayout &DL = Mod->getDataLayout();
+  SmallVector<std::pair<uint64_t, Align>, 16> AllocatedSizes;
+  AllocatedSizes.reserve(UsedLDS.size());
+
+  for (const GlobalVariable *GV : UsedLDS) {
+    Align Alignment =
+        DL.getValueOrABITypeAlignment(GV->getAlign(), GV->getValueType());
+    uint64_t AllocSize = DL.getTypeAllocSize(GV->getValueType());
+    AllocatedSizes.emplace_back(AllocSize, Alignment);
+  }
+
+  // Sort to try to estimate the worst case alignment padding
+  //
+  // FIXME: We should really do something to fix the addresses to a more optimal
+  // value instead
+  std::sort(AllocatedSizes.begin(), AllocatedSizes.end(),
+            [](std::pair<uint64_t, Align> LHS, std::pair<uint64_t, Align> RHS) {
+              return LHS.second < RHS.second;
+            });
+
+  // Check how much local memory is being used by global objects
+  CurrentLocalMemUsage = 0;
+
+  // FIXME: Try to account for padding here. The real padding and address is
+  // currently determined from the inverse order of uses in the function when
+  // legalizing, which could also potentially change. We try to estimate the
+  // worst case here, but we probably should fix the addresses earlier.
+  for (auto Alloc : AllocatedSizes) {
+    CurrentLocalMemUsage = alignTo(CurrentLocalMemUsage, Alloc.second);
+    CurrentLocalMemUsage += Alloc.first;
+  }
+
   unsigned MaxOccupancy = ST.getOccupancyWithLocalMemSize(CurrentLocalMemUsage, F);
Index: llvm/test/CodeGen/AMDGPU/promote-alloca-padding-size-estimate.ll
===================================================================
--- llvm/test/CodeGen/AMDGPU/promote-alloca-padding-size-estimate.ll
+++ llvm/test/CodeGen/AMDGPU/promote-alloca-padding-size-estimate.ll
@@ -1,10 +1,12 @@
 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=kaveri -mattr=-code-object-v3 -disable-promote-alloca-to-vector < %s | FileCheck -check-prefix=GCN %s
 
-; This shows that the amount of LDS estimate is sensitive to the order
-; of the LDS globals.
+; This shows that the LDS size estimate should try not to be
+; sensitive to the order of the LDS globals. It should try to
+; estimate the worst case padding behavior to avoid overallocating
+; LDS.
 
-; Both of these functions use the same amount of LDS, but the total
-; changes depending on the visit order of first use.
+; These functions use the same amount of LDS, but the final total
+; size changes depending on the visit order of first use.
 
 ; The one with the suboptimal order resulting in extra padding exceeds
 ; the desired limit
@@ -29,7 +31,7 @@
 
 
 ; GCN-LABEL: {{^}}promote_alloca_size_order_0:
-; GCN: workgroup_group_segment_byte_size = 2340
+; GCN: workgroup_group_segment_byte_size = 1060
 define amdgpu_kernel void @promote_alloca_size_order_0(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in, i32 %idx) #0 {
 entry:
   %stack = alloca [5 x i32], align 4, addrspace(5)
@@ -61,7 +63,7 @@
 }
 
 ; GCN-LABEL: {{^}}promote_alloca_size_order_1:
-; GCN: workgroup_group_segment_byte_size = 2352
+; GCN: workgroup_group_segment_byte_size = 1072
 define amdgpu_kernel void @promote_alloca_size_order_1(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in, i32 %idx) #0 {
 entry:
   %stack = alloca [5 x i32], align 4, addrspace(5)
Index: llvm/test/CodeGen/AMDGPU/promote-alloca-to-lds-constantexpr-use.ll
===================================================================
--- llvm/test/CodeGen/AMDGPU/promote-alloca-to-lds-constantexpr-use.ll
+++ llvm/test/CodeGen/AMDGPU/promote-alloca-to-lds-constantexpr-use.ll
@@ -4,16 +4,20 @@
 target datalayout = "A5"
 
 @all_lds = internal unnamed_addr addrspace(3) global [16384 x i32] undef, align 4
+@some_lds = internal unnamed_addr addrspace(3) global [32 x i32] undef, align 4
+
+@initializer_user_some = addrspace(1) global i32 ptrtoint ([32 x i32] addrspace(3)* @some_lds to i32), align 4
+@initializer_user_all = addrspace(1) global i32 ptrtoint ([16384 x i32] addrspace(3)* @all_lds to i32), align 4
 
 ; This function cannot promote to using LDS because of the size of the
 ; constant expression use in the function, which was previously not
 ; detected.
-; IR-LABEL: @constant_expression_uses_lds(
+; IR-LABEL: @constant_expression_uses_all_lds(
 ; IR: alloca
 
-; ASM-LABEL: constant_expression_uses_lds:
-; ASM: .group_segment_fixed_size: 65536
-define amdgpu_kernel void @constant_expression_uses_lds(i32 addrspace(1)* nocapture %out, i32 %idx) #0 {
+; ASM-LABEL: constant_expression_uses_all_lds:
+; ASM: .amdhsa_group_segment_fixed_size 65536
+define amdgpu_kernel void @constant_expression_uses_all_lds(i32 addrspace(1)* nocapture %out, i32 %idx) #0 {
 entry:
   %stack = alloca [4 x i32], align 4, addrspace(5)
   %gep0 = getelementptr inbounds [4 x i32], [4 x i32] addrspace(5)* %stack, i32 0, i32 0
@@ -32,4 +36,130 @@
   ret void
 }
 
-attributes #0 = { "amdgpu-waves-per-eu"="1,5" }
+; Has a constant expression use through a single level of constant
+; expression, but not enough LDS to block promotion
+
+; IR-LABEL: @constant_expression_uses_some_lds(
+; IR-NOT: alloca
+
+; ASM-LABEL: {{^}}constant_expression_uses_some_lds:
+; ASM: .amdhsa_group_segment_fixed_size 4224{{$}}
+define amdgpu_kernel void @constant_expression_uses_some_lds(i32 addrspace(1)* nocapture %out, i32 %idx) #0 {
+entry:
+  %stack = alloca [4 x i32], align 4, addrspace(5)
+  %gep0 = getelementptr inbounds [4 x i32], [4 x i32] addrspace(5)* %stack, i32 0, i32 0
+  %gep1 = getelementptr inbounds [4 x i32], [4 x i32] addrspace(5)* %stack, i32 0, i32 1
+  %gep2 = getelementptr inbounds [4 x i32], [4 x i32] addrspace(5)* %stack, i32 0, i32 2
+  %gep3 = getelementptr inbounds [4 x i32], [4 x i32] addrspace(5)* %stack, i32 0, i32 3
+  store i32 9, i32 addrspace(5)* %gep0
+  store i32 10, i32 addrspace(5)* %gep1
+  store i32 99, i32 addrspace(5)* %gep2
+  store i32 43, i32 addrspace(5)* %gep3
+  %arrayidx = getelementptr inbounds [4 x i32], [4 x i32] addrspace(5)* %stack, i32 0, i32 %idx
+  %load = load i32, i32 addrspace(5)* %arrayidx, align 4
+  store i32 %load, i32 addrspace(1)* %out
+  store volatile i32 ptrtoint ([32 x i32] addrspace(3)* @some_lds to i32), i32 addrspace(1)* undef
+  ret void
+}
+
+declare void @callee(i8*)
+
+; IR-LABEL: @constant_expression_uses_all_lds_multi_level(
+; IR: alloca
+
+; ASM-LABEL: {{^}}constant_expression_uses_all_lds_multi_level:
+; ASM: .amdhsa_group_segment_fixed_size 65536{{$}}
+define amdgpu_kernel void @constant_expression_uses_all_lds_multi_level(i32 addrspace(1)* nocapture %out, i32 %idx) #0 {
+entry:
+  %stack = alloca [4 x i32], align 4, addrspace(5)
+  %gep0 = getelementptr inbounds [4 x i32], [4 x i32] addrspace(5)* %stack, i32 0, i32 0
+  %gep1 = getelementptr inbounds [4 x i32], [4 x i32] addrspace(5)* %stack, i32 0, i32 1
+  %gep2 = getelementptr inbounds [4 x i32], [4 x i32] addrspace(5)* %stack, i32 0, i32 2
+  %gep3 = getelementptr inbounds [4 x i32], [4 x i32] addrspace(5)* %stack, i32 0, i32 3
+  store i32 9, i32 addrspace(5)* %gep0
+  store i32 10, i32 addrspace(5)* %gep1
+  store i32 99, i32 addrspace(5)* %gep2
+  store i32 43, i32 addrspace(5)* %gep3
+  %arrayidx = getelementptr inbounds [4 x i32], [4 x i32] addrspace(5)* %stack, i32 0, i32 %idx
+  %load = load i32, i32 addrspace(5)* %arrayidx, align 4
+  store i32 %load, i32 addrspace(1)* %out
+  call void @callee(i8* addrspacecast (i8 addrspace(3)* bitcast (i32 addrspace(3)* getelementptr inbounds ([16384 x i32], [16384 x i32] addrspace(3)* @all_lds, i32 0, i32 8) to i8 addrspace(3)*) to i8*))
+  ret void
+}
+
+; IR-LABEL: @constant_expression_uses_some_lds_multi_level(
+; IR-NOT: alloca
+; IR: llvm.amdgcn.workitem.id
+
+; ASM-LABEL: {{^}}constant_expression_uses_some_lds_multi_level:
+; ASM: .amdhsa_group_segment_fixed_size 4224{{$}}
+define amdgpu_kernel void @constant_expression_uses_some_lds_multi_level(i32 addrspace(1)* nocapture %out, i32 %idx) #0 {
+entry:
+  %stack = alloca [4 x i32], align 4, addrspace(5)
+  %gep0 = getelementptr inbounds [4 x i32], [4 x i32] addrspace(5)* %stack, i32 0, i32 0
+  %gep1 = getelementptr inbounds [4 x i32], [4 x i32] addrspace(5)* %stack, i32 0, i32 1
+  %gep2 = getelementptr inbounds [4 x i32], [4 x i32] addrspace(5)* %stack, i32 0, i32 2
+  %gep3 = getelementptr inbounds [4 x i32], [4 x i32] addrspace(5)* %stack, i32 0, i32 3
+  store i32 9, i32 addrspace(5)* %gep0
+  store i32 10, i32 addrspace(5)* %gep1
+  store i32 99, i32 addrspace(5)* %gep2
+  store i32 43, i32 addrspace(5)* %gep3
+  %arrayidx = getelementptr inbounds [4 x i32], [4 x i32] addrspace(5)* %stack, i32 0, i32 %idx
+  %load = load i32, i32 addrspace(5)* %arrayidx, align 4
+  store i32 %load, i32 addrspace(1)* %out
+  call void @callee(i8* addrspacecast (i8 addrspace(3)* bitcast (i32 addrspace(3)* getelementptr inbounds ([32 x i32], [32 x i32] addrspace(3)* @some_lds, i32 0, i32 8) to i8 addrspace(3)*) to i8*))
+  ret void
+}
+
+; IR-LABEL: @constant_expression_uses_some_lds_global_initializer(
+; IR-NOT: alloca
+; IR: llvm.amdgcn.workitem.id
+
+; ASM-LABEL: {{^}}constant_expression_uses_some_lds_global_initializer:
+; ASM: .amdhsa_group_segment_fixed_size 4096{{$}}
+define amdgpu_kernel void @constant_expression_uses_some_lds_global_initializer(i32 addrspace(1)* nocapture %out, i32 %idx) #0 {
+entry:
+  %stack = alloca [4 x i32], align 4, addrspace(5)
+  %gep0 = getelementptr inbounds [4 x i32], [4 x i32] addrspace(5)* %stack, i32 0, i32 0
+  %gep1 = getelementptr inbounds [4 x i32], [4 x i32] addrspace(5)* %stack, i32 0, i32 1
+  %gep2 = getelementptr inbounds [4 x i32], [4 x i32] addrspace(5)* %stack, i32 0, i32 2
+  %gep3 = getelementptr inbounds [4 x i32], [4 x i32] addrspace(5)* %stack, i32 0, i32 3
+  store i32 9, i32 addrspace(5)* %gep0
+  store i32 10, i32 addrspace(5)* %gep1
+  store i32 99, i32 addrspace(5)* %gep2
+  store i32 43, i32 addrspace(5)* %gep3
+  %arrayidx = getelementptr inbounds [4 x i32], [4 x i32] addrspace(5)* %stack, i32 0, i32 %idx
+  %load = load i32, i32 addrspace(5)* %arrayidx, align 4
+  store i32 %load, i32 addrspace(1)* %out
+
+  store volatile i32 ptrtoint (i32 addrspace(1)* @initializer_user_some to i32), i32 addrspace(1)* undef
+  ret void
+}
+
+; We can't actually handle LDS initializers in global initializers,
+; but this should count as usage.
+
+; IR-LABEL: @constant_expression_uses_all_lds_global_initializer(
+; IR: alloca
+
+; ASM-LABEL: {{^}}constant_expression_uses_all_lds_global_initializer:
+; ASM: .group_segment_fixed_size: 65536
+define amdgpu_kernel void @constant_expression_uses_all_lds_global_initializer(i32 addrspace(1)* nocapture %out, i32 %idx) #0 {
+entry:
+  %stack = alloca [4 x i32], align 4, addrspace(5)
+  %gep0 = getelementptr inbounds [4 x i32], [4 x i32] addrspace(5)* %stack, i32 0, i32 0
+  %gep1 = getelementptr inbounds [4 x i32], [4 x i32] addrspace(5)* %stack, i32 0, i32 1
+  %gep2 = getelementptr inbounds [4 x i32], [4 x i32] addrspace(5)* %stack, i32 0, i32 2
+  %gep3 = getelementptr inbounds [4 x i32], [4 x i32] addrspace(5)* %stack, i32 0, i32 3
+  store i32 9, i32 addrspace(5)* %gep0
+  store i32 10, i32 addrspace(5)* %gep1
+  store i32 99, i32 addrspace(5)* %gep2
+  store i32 43, i32 addrspace(5)* %gep3
+  %arrayidx = getelementptr inbounds [4 x i32], [4 x i32] addrspace(5)* %stack, i32 0, i32 %idx
+  %load = load i32, i32 addrspace(5)* %arrayidx, align 4
+  store i32 %load, i32 addrspace(1)* %out
+  store volatile i32 ptrtoint (i32 addrspace(1)* @initializer_user_all to i32), i32 addrspace(1)* undef
+  ret void
+}
+
+attributes #0 = { "amdgpu-waves-per-eu"="1,5" "amdgpu-flat-work-group-size"="256,256" }