diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp --- a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp @@ -446,7 +446,7 @@ OutStreamer->SwitchSection(ConfigSection); } - if (MFI->isEntryFunction()) { + if (MFI->isModuleEntryFunction()) { getSIProgramInfo(CurrentProgramInfo, MF); } else { auto I = CallGraphResourceInfo.insert( @@ -459,7 +459,7 @@ if (STM.isAmdPalOS()) { if (MFI->isEntryFunction()) EmitPALMetadata(MF, CurrentProgramInfo); - else + else if (MFI->isModuleEntryFunction()) emitPALFunctionMetadata(MF); } else if (!STM.isAmdHsaOS()) { EmitProgramInfoSI(MF, CurrentProgramInfo); @@ -922,7 +922,22 @@ = TII->getNamedOperand(MI, AMDGPU::OpName::callee); const Function *Callee = getCalleeFunction(*CalleeOp); - if (!Callee || Callee->isDeclaration()) { + DenseMap<const Function *, SIFunctionResourceInfo>::const_iterator I = + CallGraphResourceInfo.end(); + bool IsExternal = !Callee || Callee->isDeclaration(); + if (!IsExternal) + I = CallGraphResourceInfo.find(Callee); + + if (IsExternal || I == CallGraphResourceInfo.end()) { + // Avoid crashing on undefined behavior with an illegal call to a + // kernel. If a callsite's calling convention doesn't match the + // function's, it's undefined behavior. If the callsite calling + // convention does match, that would have errored earlier. + // FIXME: The verifier shouldn't allow this. + if (!IsExternal && + AMDGPU::isEntryFunctionCC(Callee->getCallingConv())) + report_fatal_error("invalid call to entry function"); + // If this is a call to an external function, we can't do much. Make // conservative guesses. @@ -943,19 +958,6 @@ // We force CodeGen to run in SCC order, so the callee's register // usage etc. should be the cumulative usage of all callees. - auto I = CallGraphResourceInfo.find(Callee); - if (I == CallGraphResourceInfo.end()) { - // Avoid crashing on undefined behavior with an illegal call to a - // kernel. 
If a callsite's calling convention doesn't match the - // function's, it's undefined behavior. If the callsite calling - // convention does match, that would have errored earlier. - // FIXME: The verifier shouldn't allow this. - if (AMDGPU::isEntryFunctionCC(Callee->getCallingConv())) - report_fatal_error("invalid call to entry function"); - - llvm_unreachable("callee should have been handled before caller"); - } - - MaxSGPR = std::max(I->second.NumExplicitSGPR - 1, MaxSGPR); MaxVGPR = std::max(I->second.NumVGPR - 1, MaxVGPR); MaxAGPR = std::max(I->second.NumAGPR - 1, MaxAGPR); @@ -1266,7 +1268,11 @@ void AMDGPUAsmPrinter::emitPALFunctionMetadata(const MachineFunction &MF) { auto *MD = getTargetStreamer()->getPALMetadata(); const MachineFrameInfo &MFI = MF.getFrameInfo(); - MD->setStackFrameSize(MF, MFI.getStackSize()); + MD->setFunctionScratchSize(MF, MFI.getStackSize()); + // Set compute registers + MD->setRsrc1(CallingConv::AMDGPU_CS, + CurrentProgramInfo.getPGMRSrc1(CallingConv::AMDGPU_CS)); + MD->setRsrc2(CallingConv::AMDGPU_CS, CurrentProgramInfo.ComputePGMRSrc2); } // This is supposed to be log2(Size) diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp --- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp @@ -1301,7 +1301,7 @@ if (G->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS || G->getAddressSpace() == AMDGPUAS::REGION_ADDRESS) { - if (!MFI->isEntryFunction()) { + if (!MFI->isModuleEntryFunction()) { SDLoc DL(Op); const Function &Fn = DAG.getMachineFunction().getFunction(); DiagnosticInfoUnsupported BadLDSDecl( diff --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp --- a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp @@ -2260,7 +2260,7 @@ SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) { 
- if (!MFI->isEntryFunction()) { + if (!MFI->isModuleEntryFunction()) { const Function &Fn = MF.getFunction(); DiagnosticInfoUnsupported BadLDSDecl( Fn, "local memory global used by non-kernel function", MI.getDebugLoc(), diff --git a/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.h b/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.h --- a/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.h @@ -44,10 +44,13 @@ // State of MODE register, assumed FP mode. AMDGPU::SIModeRegisterDefaults Mode; - // Kernels + shaders. i.e. functions called by the driver and not called + // Kernels + shaders. i.e. functions called by the hardware and not called // by other functions. bool IsEntryFunction = false; + // Module entry points: entry functions plus functions callable by others. + bool IsModuleEntryFunction = false; + bool NoSignedZerosFPMath = false; // Function may be memory bound. @@ -77,6 +80,8 @@ return IsEntryFunction; } + bool isModuleEntryFunction() const { return IsModuleEntryFunction; } + bool hasNoSignedZerosFPMath() const { return NoSignedZerosFPMath; } diff --git a/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.cpp b/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.cpp --- a/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.cpp @@ -13,11 +13,13 @@ using namespace llvm; -AMDGPUMachineFunction::AMDGPUMachineFunction(const MachineFunction &MF) : - MachineFunctionInfo(), - Mode(MF.getFunction()), - IsEntryFunction(AMDGPU::isEntryFunctionCC(MF.getFunction().getCallingConv())), - NoSignedZerosFPMath(MF.getTarget().Options.NoSignedZerosFPMath) { +AMDGPUMachineFunction::AMDGPUMachineFunction(const MachineFunction &MF) + : MachineFunctionInfo(), Mode(MF.getFunction()), + IsEntryFunction( + AMDGPU::isEntryFunctionCC(MF.getFunction().getCallingConv())), + IsModuleEntryFunction( + AMDGPU::isModuleEntryFunctionCC(MF.getFunction().getCallingConv())), + 
NoSignedZerosFPMath(MF.getTarget().Options.NoSignedZerosFPMath) { const AMDGPUSubtarget &ST = AMDGPUSubtarget::get(MF); // FIXME: Should initialize KernArgSize based on ExplicitKernelArgOffset, diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h --- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h +++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h @@ -576,6 +576,15 @@ LLVM_READNONE bool isEntryFunctionCC(CallingConv::ID CC); +// These functions are considered entrypoints into the current module, i.e. they +// are allowed to be called from outside the current module. This is different +// from isEntryFunctionCC, which is only true for functions that are entered by +// the hardware. Module entry points include all entry functions but also +// include functions that can be called from other functions inside or outside +// the current module. Module entry functions are allowed to allocate LDS. +LLVM_READNONE +bool isModuleEntryFunctionCC(CallingConv::ID CC); + // FIXME: Remove this when calling conventions cleaned up LLVM_READNONE inline bool isKernel(CallingConv::ID CC) { diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp --- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp +++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp @@ -1068,6 +1068,15 @@ } } +bool isModuleEntryFunctionCC(CallingConv::ID CC) { + switch (CC) { + case CallingConv::AMDGPU_Gfx: + return true; + default: + return isEntryFunctionCC(CC); + } +} + bool hasXNACK(const MCSubtargetInfo &STI) { return STI.getFeatureBits()[AMDGPU::FeatureXNACK]; } diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUPALMetadata.h b/llvm/lib/Target/AMDGPU/Utils/AMDGPUPALMetadata.h --- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUPALMetadata.h +++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUPALMetadata.h @@ -79,7 +79,7 @@ void setScratchSize(unsigned CC, unsigned Val); // Set the stack frame size of a function in the metadata. 
- void setStackFrameSize(const MachineFunction &MF, unsigned Val); + void setFunctionScratchSize(const MachineFunction &MF, unsigned Val); // Set the hardware register bit in PAL metadata to enable wave32 on the // shader of the given calling convention. @@ -130,6 +130,9 @@ // Get (create if necessary) the shader functions map. msgpack::MapDocNode getShaderFunctions(); + // Get (create if necessary) a function in the shader functions map. + msgpack::MapDocNode getShaderFunction(StringRef Name); + // Get (create if necessary) the .hardware_stages entry for the given calling // convention. msgpack::MapDocNode getHwStage(unsigned CC); diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUPALMetadata.cpp b/llvm/lib/Target/AMDGPU/Utils/AMDGPUPALMetadata.cpp --- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUPALMetadata.cpp +++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUPALMetadata.cpp @@ -238,12 +238,11 @@ getHwStage(CC)[".scratch_memory_size"] = MsgPackDoc.getNode(Val); } -// Set the scratch size in the metadata. -void AMDGPUPALMetadata::setStackFrameSize(const MachineFunction &MF, - unsigned Val) { - auto Node = MsgPackDoc.getMapNode(); +// Set the stack frame size of a function in the metadata. +void AMDGPUPALMetadata::setFunctionScratchSize(const MachineFunction &MF, + unsigned Val) { + auto Node = getShaderFunction(MF.getFunction().getName()); Node[".stack_frame_size_in_bytes"] = MsgPackDoc.getNode(Val); - getShaderFunctions()[MF.getFunction().getName()] = Node; } // Set the hardware register bit in PAL metadata to enable wave32 on the @@ -747,6 +746,12 @@ return ShaderFunctions.getMap(); } +// Get (create if necessary) a function in the shader functions map. +msgpack::MapDocNode AMDGPUPALMetadata::getShaderFunction(StringRef Name) { + auto Functions = getShaderFunctions(); + return Functions[Name].getMap(/*Convert=*/true); +} + // Return the PAL metadata hardware shader stage name. 
static const char *getStageName(CallingConv::ID CC) { switch (CC) { diff --git a/llvm/test/CodeGen/AMDGPU/amdpal-callable.ll b/llvm/test/CodeGen/AMDGPU/amdpal-callable.ll --- a/llvm/test/CodeGen/AMDGPU/amdpal-callable.ll +++ b/llvm/test/CodeGen/AMDGPU/amdpal-callable.ll @@ -1,5 +1,4 @@ ; RUN: llc -mtriple=amdgcn--amdpal -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,SDAG -enable-var-scope %s -; RUN: llc -mtriple=amdgcn--amdpal -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,SDAG -enable-var-scope %s ; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,SDAG -enable-var-scope %s ; RUN: llc -global-isel -mtriple=amdgcn--amdpal -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GISEL -enable-var-scope %s @@ -126,10 +125,29 @@ ret float %add } +@lds = internal addrspace(3) global [64 x float] undef + +define amdgpu_gfx float @simple_lds(float %arg0) #0 { + %lds_ptr = getelementptr [64 x float], [64 x float] addrspace(3)* @lds, i32 0, i32 0 + %val = load float, float addrspace(3)* %lds_ptr + ret float %val +} + +define amdgpu_gfx float @simple_lds_recurse(float %arg0) #0 { + %lds_ptr = getelementptr [64 x float], [64 x float] addrspace(3)* @lds, i32 0, i32 0 + %val = load float, float addrspace(3)* %lds_ptr + %res = call amdgpu_gfx float @simple_lds_recurse(float %val) + ret float %res +} + attributes #0 = { nounwind } ; GCN: amdpal.pipelines: -; GCN-NEXT: - .registers: {} +; GCN-NEXT: - .registers: +; SDAG-NEXT: 0x2e12 (COMPUTE_PGM_RSRC1): 0xaf01ca{{$}} +; SDAG-NEXT: 0x2e13 (COMPUTE_PGM_RSRC2): 0x8001{{$}} +; GISEL-NEXT: 0x2e12 (COMPUTE_PGM_RSRC1): 0xaf01cf{{$}} +; GISEL-NEXT: 0x2e13 (COMPUTE_PGM_RSRC2): 0x8001{{$}} ; GCN-NEXT: .shader_functions: ; GCN-NEXT: dynamic_stack: ; GCN-NEXT: .stack_frame_size_in_bytes: 0x10{{$}} @@ -148,6 +166,10 @@ ; GISEL-NEXT: .stack_frame_size_in_bytes: 0xd0{{$}} ; GCN-NEXT: no_stack_indirect_call: ; GCN-NEXT: 
.stack_frame_size_in_bytes: 0x10{{$}} +; GCN-NEXT: simple_lds: +; GCN-NEXT: .stack_frame_size_in_bytes: 0{{$}} +; GCN-NEXT: simple_lds_recurse: +; GCN-NEXT: .stack_frame_size_in_bytes: 0x10{{$}} ; GCN-NEXT: simple_stack: ; GCN-NEXT: .stack_frame_size_in_bytes: 0x14{{$}} ; GCN-NEXT: simple_stack_call: