Index: llvm/lib/Target/AMDGPU/AMDGPUAnnotateKernelFeatures.cpp =================================================================== --- llvm/lib/Target/AMDGPU/AMDGPUAnnotateKernelFeatures.cpp +++ llvm/lib/Target/AMDGPU/AMDGPUAnnotateKernelFeatures.cpp @@ -279,6 +279,7 @@ bool HasApertureRegs = ST.hasApertureRegs(); SmallPtrSet<const Constant *, 8> ConstantExprVisited; + bool HaveStackObjects = false; bool Changed = false; bool NeedQueuePtr = false; bool HaveCall = false; @@ -286,6 +287,11 @@ for (BasicBlock &BB : F) { for (Instruction &I : BB) { + if (isa<AllocaInst>(I)) { + HaveStackObjects = true; + continue; + } + if (auto *CB = dyn_cast<CallBase>(&I)) { const Function *Callee = dyn_cast<Function>(CB->getCalledOperand()->stripPointerCasts()); @@ -355,6 +361,11 @@ Changed = true; } + if (HaveStackObjects) { + F.addFnAttr("amdgpu-stack-objects"); + Changed = true; + } + return Changed; } Index: llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp =================================================================== --- llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp +++ llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp @@ -55,11 +55,10 @@ Occupancy = ST.computeOccupancy(MF, getLDSSize()); CallingConv::ID CC = F.getCallingConv(); - const MachineFrameInfo &FrameInfo = MF.getFrameInfo(); // FIXME: Should have analysis or something rather than attribute to detect // calls. - const bool HasCalls = FrameInfo.hasCalls() || F.hasFnAttribute("amdgpu-calls"); + const bool HasCalls = F.hasFnAttribute("amdgpu-calls"); // Enable all kernel inputs if we have the fixed ABI. Don't bother if we don't // have any calls. @@ -125,8 +124,7 @@ WorkItemIDZ = true; } - bool HasStackObjects = FrameInfo.hasStackObjects(); - + bool HasStackObjects = F.hasFnAttribute("amdgpu-stack-objects"); if (isEntryFunction()) { // X, XY, and XYZ are the only supported combinations, so make sure Y is // enabled if Z is. 
@@ -170,20 +168,10 @@ KernargSegmentPtr = true; if (ST.hasFlatAddressSpace() && isEntryFunction() && isAmdHsaOrMesa) { - auto hasNonSpillStackObjects = [&]() { - // Avoid expensive checking if there's no stack objects. - if (!HasStackObjects) - return false; - for (auto OI = FrameInfo.getObjectIndexBegin(), - OE = FrameInfo.getObjectIndexEnd(); OI != OE; ++OI) - if (!FrameInfo.isSpillSlotObjectIndex(OI)) - return true; - // All stack objects are spill slots. - return false; - }; // TODO: This could be refined a lot. The attribute is a poor way of - // detecting calls that may require it before argument lowering. - if (HasCalls || hasNonSpillStackObjects()) + // detecting calls or stack objects that may require it before argument + // lowering. + if (HasCalls || HasStackObjects) FlatScratchInit = true; } Index: llvm/test/CodeGen/AMDGPU/GlobalISel/flat-scratch-init.ll =================================================================== --- /dev/null +++ llvm/test/CodeGen/AMDGPU/GlobalISel/flat-scratch-init.ll @@ -0,0 +1,27 @@ +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefix=GCN %s + +; Make sure flat_scratch_init is set + +; GCN-LABEL: {{^}}stack_object_addrspacecast_in_kernel_no_calls: +; GCN: .amdhsa_user_sgpr_flat_scratch_init 1 +define amdgpu_kernel void @stack_object_addrspacecast_in_kernel_no_calls() { + %alloca = alloca i32, addrspace(5) + %cast = addrspacecast i32 addrspace(5)* %alloca to i32* + store volatile i32 0, i32* %cast + ret void +} + +; TODO: Could optimize out in this case +; GCN-LABEL: {{^}}stack_object_in_kernel_no_calls: +; GCN: .amdhsa_user_sgpr_flat_scratch_init 1 +define amdgpu_kernel void @stack_object_in_kernel_no_calls() { + %alloca = alloca i32, addrspace(5) + store volatile i32 0, i32 addrspace(5)* %alloca + ret void +} + +; GCN-LABEL: {{^}}kernel_no_calls_no_stack: +; GCN: .amdhsa_user_sgpr_flat_scratch_init 0 +define amdgpu_kernel void @kernel_no_calls_no_stack() { + ret void +} Index: 
llvm/test/CodeGen/AMDGPU/annotate-kernel-features-hsa.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/annotate-kernel-features-hsa.ll +++ llvm/test/CodeGen/AMDGPU/annotate-kernel-features-hsa.ll @@ -1,5 +1,7 @@ ; RUN: opt -mtriple=amdgcn-unknown-amdhsa -S -amdgpu-annotate-kernel-features < %s | FileCheck -check-prefix=HSA %s +target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5" + declare i32 @llvm.amdgcn.workgroup.id.x() #0 declare i32 @llvm.amdgcn.workgroup.id.y() #0 declare i32 @llvm.amdgcn.workgroup.id.z() #0 @@ -250,6 +252,31 @@ ret void } +; HSA: define amdgpu_kernel void @use_alloca() #13 { +define amdgpu_kernel void @use_alloca() #1 { + %alloca = alloca i32, addrspace(5) + store i32 0, i32 addrspace(5)* %alloca + ret void +} + +; HSA: define amdgpu_kernel void @use_alloca_non_entry_block() #13 { +define amdgpu_kernel void @use_alloca_non_entry_block() #1 { +entry: + br label %bb + +bb: + %alloca = alloca i32, addrspace(5) + store i32 0, i32 addrspace(5)* %alloca + ret void +} + +; HSA: define void @use_alloca_func() #13 { +define void @use_alloca_func() #1 { + %alloca = alloca i32, addrspace(5) + store i32 0, i32 addrspace(5)* %alloca + ret void +} + attributes #0 = { nounwind readnone speculatable } attributes #1 = { nounwind } @@ -266,3 +293,4 @@ ; HSA: attributes #10 = { nounwind "amdgpu-dispatch-ptr" } ; HSA: attributes #11 = { nounwind "amdgpu-queue-ptr" } ; HSA: attributes #12 = { nounwind "amdgpu-kernarg-segment-ptr" } +; HSA: attributes #13 = { nounwind "amdgpu-stack-objects" }