Index: lib/Target/AMDGPU/AMDGPUAnnotateKernelFeatures.cpp
===================================================================
--- lib/Target/AMDGPU/AMDGPUAnnotateKernelFeatures.cpp
+++ lib/Target/AMDGPU/AMDGPUAnnotateKernelFeatures.cpp
@@ -201,11 +201,14 @@
 }
 
 bool AMDGPUAnnotateKernelFeatures::addFeatureAttributes(Function &F) {
-  bool HasApertureRegs = TM->getSubtarget<AMDGPUSubtarget>(F).hasApertureRegs();
+  const AMDGPUSubtarget &ST = TM->getSubtarget<AMDGPUSubtarget>(F);
+  bool HasFlat = ST.hasFlatAddressSpace();
+  bool HasApertureRegs = ST.hasApertureRegs();
   SmallPtrSet<const Constant *, 8> ConstantExprVisited;
 
   bool Changed = false;
   bool NeedQueuePtr = false;
+  bool HaveCall = false;
   bool IsFunc = !AMDGPU::isEntryFunctionCC(F.getCallingConv());
 
   for (BasicBlock &BB : F) {
@@ -215,11 +218,15 @@
       Function *Callee = CS.getCalledFunction();
 
       // TODO: Do something with indirect calls.
-      if (!Callee)
+      if (!Callee) {
+        if (!CS.isInlineAsm())
+          HaveCall = true;
         continue;
+      }
 
       Intrinsic::ID IID = Callee->getIntrinsicID();
       if (IID == Intrinsic::not_intrinsic) {
+        HaveCall = true;
         copyFeaturesToFunction(F, *Callee, NeedQueuePtr);
         Changed = true;
       } else {
@@ -261,6 +268,14 @@
     Changed = true;
   }
 
+  // TODO: We could refine this to captured pointers that could possibly be
+  // accessed by flat instructions. For now this is mostly a poor way of
+  // estimating whether there are calls before argument lowering.
+  if (HasFlat && !IsFunc && HaveCall) {
+    F.addFnAttr("amdgpu-flat-scratch");
+    Changed = true;
+  }
+
   return Changed;
 }
 
Index: lib/Target/AMDGPU/SIFrameLowering.cpp
===================================================================
--- lib/Target/AMDGPU/SIFrameLowering.cpp
+++ lib/Target/AMDGPU/SIFrameLowering.cpp
@@ -246,7 +246,7 @@
   // this point it appears we need the setup. This part of the prolog should be
   // emitted after frame indices are eliminated.
 
-  if (MF.getFrameInfo().hasStackObjects() && MFI->hasFlatScratchInit())
+  if (MFI->hasFlatScratchInit())
     emitFlatScratchInit(ST, MF, MBB);
 
   unsigned SPReg = MFI->getStackPtrOffsetReg();
@@ -254,7 +254,8 @@
   assert(MRI.isReserved(SPReg) && "SPReg used but not reserved");
 
   DebugLoc DL;
-  int64_t StackSize = MF.getFrameInfo().getStackSize();
+  const MachineFrameInfo &FrameInfo = MF.getFrameInfo();
+  int64_t StackSize = FrameInfo.getStackSize();
 
   if (StackSize == 0) {
     BuildMI(MBB, MBB.begin(), DL, TII->get(AMDGPU::COPY), SPReg)
Index: lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
===================================================================
--- lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
+++ lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
@@ -152,7 +152,8 @@
     }
   }
 
-  if (ST.isAmdCodeObjectV2(MF)) {
+  bool IsCOV2 = ST.isAmdCodeObjectV2(MF);
+  if (IsCOV2) {
     if (HasStackObjects || MaySpill)
       PrivateSegmentBuffer = true;
 
@@ -172,12 +173,12 @@
   if (F->hasFnAttribute("amdgpu-kernarg-segment-ptr"))
     KernargSegmentPtr = true;
 
-  // We don't need to worry about accessing spills with flat instructions.
-  // TODO: On VI where we must use flat for global, we should be able to omit
-  // this if it is never used for generic access.
-  if (HasStackObjects && ST.hasFlatAddressSpace() && ST.isAmdHsaOS() &&
-      isEntryFunction())
-    FlatScratchInit = true;
+  if (ST.hasFlatAddressSpace() && isEntryFunction() && IsCOV2) {
+    // TODO: This could be refined a lot. The attribute is a poor way of
+    // detecting calls that may require it before argument lowering.
+    if (HasStackObjects || F->hasFnAttribute("amdgpu-flat-scratch"))
+      FlatScratchInit = true;
+  }
 }
 
 unsigned SIMachineFunctionInfo::addPrivateSegmentBuffer(
Index: test/CodeGen/AMDGPU/annotate-kernel-features-hsa-call.ll
===================================================================
--- test/CodeGen/AMDGPU/annotate-kernel-features-hsa-call.ll
+++ test/CodeGen/AMDGPU/annotate-kernel-features-hsa-call.ll
@@ -250,9 +250,48 @@
   ret void
 }
 
+; HSA: declare void @external.func() #15
+declare void @external.func() #3
+
+; HSA: define internal void @defined.func() #15 {
+define internal void @defined.func() #3 {
+  ret void
+}
+
+; HSA: define void @func_call_external() #15 {
+define void @func_call_external() #3 {
+  call void @external.func()
+  ret void
+}
+
+; HSA: define void @func_call_defined() #15 {
+define void @func_call_defined() #3 {
+  call void @defined.func()
+  ret void
+}
+
+; HSA: define void @func_call_asm() #15 {
+define void @func_call_asm() #3 {
+  call void asm sideeffect "", ""() #3
+  ret void
+}
+
+; HSA: define amdgpu_kernel void @kern_call_external() #16 {
+define amdgpu_kernel void @kern_call_external() #3 {
+  call void @external.func()
+  ret void
+}
+
+; HSA: define amdgpu_kernel void @func_kern_defined() #16 {
+define amdgpu_kernel void @func_kern_defined() #3 {
+  call void @defined.func()
+  ret void
+}
+
 attributes #0 = { nounwind readnone speculatable }
 attributes #1 = { nounwind "target-cpu"="fiji" }
 attributes #2 = { nounwind "target-cpu"="gfx900" }
+attributes #3 = { nounwind }
 
 ; HSA: attributes #0 = { nounwind readnone speculatable }
 ; HSA: attributes #1 = { nounwind "amdgpu-work-item-id-x" "target-cpu"="fiji" }
@@ -269,3 +308,5 @@
 ; HSA: attributes #12 = { nounwind "target-cpu"="gfx900" }
 ; HSA: attributes #13 = { nounwind "amdgpu-queue-ptr" "target-cpu"="gfx900" }
 ; HSA: attributes #14 = { nounwind "amdgpu-kernarg-segment-ptr" "target-cpu"="fiji" }
+; HSA: attributes #15 = { nounwind }
+; HSA: attributes #16 = { nounwind "amdgpu-flat-scratch" }