diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.h b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.h --- a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.h @@ -78,6 +78,7 @@ const SIProgramInfo &KernelInfo); void EmitPALMetadata(const MachineFunction &MF, const SIProgramInfo &KernelInfo); + void emitPALFunctionMetadata(const MachineFunction &MF); void emitCommonFunctionComments(uint32_t NumVGPR, Optional<uint32_t> NumAGPR, uint32_t TotalNumVGPR, diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp --- a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp @@ -456,9 +456,12 @@ Info = analyzeResourceUsage(MF); } - if (STM.isAmdPalOS() && MFI->isEntryFunction()) - EmitPALMetadata(MF, CurrentProgramInfo); - else if (!STM.isAmdHsaOS()) { + if (STM.isAmdPalOS()) { + if (MFI->isEntryFunction()) + EmitPALMetadata(MF, CurrentProgramInfo); + else + emitPALFunctionMetadata(MF); + } else if (!STM.isAmdHsaOS()) { EmitProgramInfoSI(MF, CurrentProgramInfo); } @@ -1266,6 +1269,17 @@ MD->setWave32(MF.getFunction().getCallingConv()); } +// Emit PAL metadata for a non-entry function: for the amdgpu_gfx calling +// convention, record the function's stack frame size. +void AMDGPUAsmPrinter::emitPALFunctionMetadata(const MachineFunction &MF) { + auto CC = MF.getFunction().getCallingConv(); + if (CC == CallingConv::AMDGPU_Gfx) { + auto *MD = getTargetStreamer()->getPALMetadata(); + const MachineFrameInfo &MFI = MF.getFrameInfo(); + MD->setStackFrameSize(MF, MFI.getStackSize()); + } +} + // This is supposed to be log2(Size) static amd_element_byte_size_t getElementByteSizeValue(unsigned Size) { switch (Size) { diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUPALMetadata.h b/llvm/lib/Target/AMDGPU/Utils/AMDGPUPALMetadata.h --- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUPALMetadata.h +++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUPALMetadata.h @@ -15,6 +15,7 @@ #define LLVM_LIB_TARGET_AMDGPU_AMDGPUPALMETADATA_H #include "llvm/BinaryFormat/MsgPackDocument.h" +#include "llvm/CodeGen/MachineFunction.h" namespace llvm 
{ @@ -26,6 +27,7 @@ msgpack::Document MsgPackDoc; msgpack::DocNode Registers; msgpack::DocNode HwStages; + msgpack::DocNode ShaderFunctions; public: // Read the amdgpu.pal.metadata supplied by the frontend, ready for @@ -76,6 +78,9 @@ // Set the scratch size in the metadata. void setScratchSize(unsigned CC, unsigned Val); + // Set the stack frame size of a function in the metadata. + void setStackFrameSize(const MachineFunction &MF, unsigned Val); + // Set the hardware register bit in PAL metadata to enable wave32 on the // shader of the given calling convention. void setWave32(unsigned CC); @@ -119,6 +124,12 @@ // Get (create if necessary) the registers map. msgpack::MapDocNode getRegisters(); + // Reference (create if necessary) the node for the shader functions map. + msgpack::DocNode &refShaderFunctions(); + + // Get (create if necessary) the shader functions map. + msgpack::MapDocNode getShaderFunctions(); + // Get (create if necessary) the .hardware_stages entry for the given calling // convention. msgpack::MapDocNode getHwStage(unsigned CC); diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUPALMetadata.cpp b/llvm/lib/Target/AMDGPU/Utils/AMDGPUPALMetadata.cpp --- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUPALMetadata.cpp +++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUPALMetadata.cpp @@ -238,6 +238,14 @@ getHwStage(CC)[".scratch_memory_size"] = MsgPackDoc.getNode(Val); } +// Set the stack frame size of a function in the metadata. +void AMDGPUPALMetadata::setStackFrameSize(const MachineFunction &MF, + unsigned Val) { + auto Node = MsgPackDoc.getMapNode(); + Node[".stack_frame_size_in_bytes"] = MsgPackDoc.getNode(Val); + getShaderFunctions()[MF.getFunction().getName()] = Node; +} + // Set the hardware register bit in PAL metadata to enable wave32 on the // shader of the given calling convention. void AMDGPUPALMetadata::setWave32(unsigned CC) { @@ -721,6 +729,24 @@ return Registers.getMap(); } +// Reference (create if necessary) the node for the shader functions map. 
+msgpack::DocNode &AMDGPUPALMetadata::refShaderFunctions() { + auto &N = + MsgPackDoc.getRoot() + .getMap(/*Convert=*/true)[MsgPackDoc.getNode("amdpal.pipelines")] + .getArray(/*Convert=*/true)[0] + .getMap(/*Convert=*/true)[MsgPackDoc.getNode(".shader_functions")]; + N.getMap(/*Convert=*/true); + return N; +} + +// Get (create if necessary) the shader functions map, caching its node. +msgpack::MapDocNode AMDGPUPALMetadata::getShaderFunctions() { + if (ShaderFunctions.isEmpty()) + ShaderFunctions = refShaderFunctions(); + return ShaderFunctions.getMap(); +} + // Return the PAL metadata hardware shader stage name. static const char *getStageName(CallingConv::ID CC) { switch (CC) { diff --git a/llvm/test/CodeGen/AMDGPU/amdpal-callable.ll b/llvm/test/CodeGen/AMDGPU/amdpal-callable.ll --- a/llvm/test/CodeGen/AMDGPU/amdpal-callable.ll +++ b/llvm/test/CodeGen/AMDGPU/amdpal-callable.ll @@ -2,14 +2,113 @@ ; RUN: llc -mtriple=amdgcn--amdpal -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI -enable-var-scope %s ; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GFX9 -enable-var-scope %s -; GCN-LABEL: {{^}}gfx_callable_amdpal: -; GCN: .amdgpu_pal_metadata -; GCN-NEXT: --- -; GCN-NEXT: amdpal.pipelines: -; GCN-NEXT: - .registers: {} -; GCN-NEXT: ... 
-; GCN-NEXT: .end_amdgpu_pal_metadata -define amdgpu_gfx half @gfx_callable_amdpal(half %arg0) { - %add = fadd half %arg0, 1.0 - ret half %add +declare float @extern_func(float) #0 +declare float @extern_func_many_args(<64 x float>) #0 + +@funcptr = external hidden unnamed_addr addrspace(4) constant void()*, align 4 + +define amdgpu_gfx float @no_stack(float %arg0) #0 { + %add = fadd float %arg0, 1.0 + ret float %add } + +define amdgpu_gfx float @simple_stack(float %arg0) #0 { + %stack = alloca float, i32 4, align 4, addrspace(5) + store volatile float 2.0, float addrspace(5)* %stack + %val = load volatile float, float addrspace(5)* %stack + %add = fadd float %arg0, %val + ret float %add +} + +; Dynamic alloca is unsupported, so the following test is commented out. +;define amdgpu_gfx float @dynamic_stack(float %arg0, i32 %size) #0 { +; %stack = alloca float, i32 %size, align 4, addrspace(5) +; store volatile float 2.0, float addrspace(5)* %stack +; %val = load volatile float, float addrspace(5)* %stack +; %add = fadd float %arg0, %val +; ret float %add +;} + +define amdgpu_gfx float @no_stack_call(float %arg0) #0 { + %res = call amdgpu_gfx float @simple_stack(float %arg0) + ret float %res +} + +define amdgpu_gfx float @simple_stack_call(float %arg0) #0 { + %stack = alloca float, i32 4, align 4, addrspace(5) + store volatile float 2.0, float addrspace(5)* %stack + %val = load volatile float, float addrspace(5)* %stack + %res = call amdgpu_gfx float @simple_stack(float %arg0) + %add = fadd float %res, %val + ret float %add +} + +define amdgpu_gfx float @no_stack_extern_call(float %arg0) #0 { + %res = call amdgpu_gfx float @extern_func(float %arg0) + ret float %res +} + +define amdgpu_gfx float @simple_stack_extern_call(float %arg0) #0 { + %stack = alloca float, i32 4, align 4, addrspace(5) + store volatile float 2.0, float addrspace(5)* %stack + %val = load volatile float, float addrspace(5)* %stack + %res = call amdgpu_gfx float @extern_func(float %arg0) + %add = fadd float %res, %val + ret float %add +} + 
+define amdgpu_gfx float @no_stack_extern_call_many_args(<64 x float> %arg0) #0 { + %res = call amdgpu_gfx float @extern_func_many_args(<64 x float> %arg0) + ret float %res +} + +define amdgpu_gfx float @no_stack_indirect_call(float %arg0) #0 { + %fptr = load void()*, void()* addrspace(4)* @funcptr + call amdgpu_gfx void %fptr() + ret float %arg0 +} + +define amdgpu_gfx float @simple_stack_indirect_call(float %arg0) #0 { + %stack = alloca float, i32 4, align 4, addrspace(5) + store volatile float 2.0, float addrspace(5)* %stack + %val = load volatile float, float addrspace(5)* %stack + %fptr = load void()*, void()* addrspace(4)* @funcptr + call amdgpu_gfx void %fptr() + %add = fadd float %arg0, %val + ret float %add +} + +define amdgpu_gfx float @simple_stack_recurse(float %arg0) #0 { + %stack = alloca float, i32 4, align 4, addrspace(5) + store volatile float 2.0, float addrspace(5)* %stack + %val = load volatile float, float addrspace(5)* %stack + %res = call amdgpu_gfx float @simple_stack_recurse(float %arg0) + %add = fadd float %res, %val + ret float %add +} + +attributes #0 = { nounwind } + +; GCN: amdpal.pipelines: +; GCN-NEXT: - .registers: {} +; GCN-NEXT: .shader_functions: +; GCN-NEXT: no_stack: +; GCN-NEXT: .stack_frame_size_in_bytes: 0 +; GCN-NEXT: no_stack_call: +; GCN-NEXT: .stack_frame_size_in_bytes: 0 +; GCN-NEXT: no_stack_extern_call: +; GCN-NEXT: .stack_frame_size_in_bytes: 0x10 +; GCN-NEXT: no_stack_extern_call_many_args: +; GCN-NEXT: .stack_frame_size_in_bytes: 0x90 +; GCN-NEXT: no_stack_indirect_call: +; GCN-NEXT: .stack_frame_size_in_bytes: 0x10 +; GCN-NEXT: simple_stack: +; GCN-NEXT: .stack_frame_size_in_bytes: 0x14 +; GCN-NEXT: simple_stack_call: +; GCN-NEXT: .stack_frame_size_in_bytes: 0x20 +; GCN-NEXT: simple_stack_extern_call: +; GCN-NEXT: .stack_frame_size_in_bytes: 0x20 +; GCN-NEXT: simple_stack_indirect_call: +; GCN-NEXT: .stack_frame_size_in_bytes: 0x20 +; GCN-NEXT: simple_stack_recurse: +; GCN-NEXT: .stack_frame_size_in_bytes: 0x20