diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.h b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.h --- a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.h @@ -78,6 +78,7 @@ const SIProgramInfo &KernelInfo); void EmitPALMetadata(const MachineFunction &MF, const SIProgramInfo &KernelInfo); + void emitPALFunctionMetadata(const MachineFunction &MF); void emitCommonFunctionComments(uint32_t NumVGPR, Optional<uint32_t> NumAGPR, uint32_t TotalNumVGPR, diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp --- a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp @@ -456,9 +456,12 @@ Info = analyzeResourceUsage(MF); } - if (STM.isAmdPalOS() && MFI->isEntryFunction()) - EmitPALMetadata(MF, CurrentProgramInfo); - else if (!STM.isAmdHsaOS()) { + if (STM.isAmdPalOS()) { + if (MFI->isEntryFunction()) + EmitPALMetadata(MF, CurrentProgramInfo); + else + emitPALFunctionMetadata(MF); + } else if (!STM.isAmdHsaOS()) { EmitProgramInfoSI(MF, CurrentProgramInfo); } @@ -1260,6 +1263,12 @@ MD->setWave32(MF.getFunction().getCallingConv()); } +void AMDGPUAsmPrinter::emitPALFunctionMetadata(const MachineFunction &MF) { + auto *MD = getTargetStreamer()->getPALMetadata(); + const MachineFrameInfo &MFI = MF.getFrameInfo(); + MD->setStackFrameSize(MF, MFI.getStackSize()); +} + // This is supposed to be log2(Size) static amd_element_byte_size_t getElementByteSizeValue(unsigned Size) { switch (Size) { diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUPALMetadata.h b/llvm/lib/Target/AMDGPU/Utils/AMDGPUPALMetadata.h --- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUPALMetadata.h +++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUPALMetadata.h @@ -15,6 +15,7 @@ #define LLVM_LIB_TARGET_AMDGPU_AMDGPUPALMETADATA_H #include "llvm/BinaryFormat/MsgPackDocument.h" +#include "llvm/CodeGen/MachineFunction.h" namespace llvm { @@ -26,6 +27,7 @@ msgpack::Document MsgPackDoc; msgpack::DocNode Registers; 
msgpack::DocNode HwStages; + msgpack::DocNode ShaderFunctions; public: // Read the amdgpu.pal.metadata supplied by the frontend, ready for @@ -76,6 +78,9 @@ // Set the scratch size in the metadata. void setScratchSize(unsigned CC, unsigned Val); + // Set the stack frame size of a function in the metadata. + void setStackFrameSize(const MachineFunction &MF, unsigned Val); + // Set the hardware register bit in PAL metadata to enable wave32 on the // shader of the given calling convention. void setWave32(unsigned CC); @@ -119,6 +124,12 @@ // Get (create if necessary) the registers map. msgpack::MapDocNode getRegisters(); + // Reference (create if necessary) the node for the shader functions map. + msgpack::DocNode &refShaderFunctions(); + + // Get (create if necessary) the shader functions map. + msgpack::MapDocNode getShaderFunctions(); + // Get (create if necessary) the .hardware_stages entry for the given calling // convention. msgpack::MapDocNode getHwStage(unsigned CC); diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUPALMetadata.cpp b/llvm/lib/Target/AMDGPU/Utils/AMDGPUPALMetadata.cpp --- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUPALMetadata.cpp +++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUPALMetadata.cpp @@ -238,6 +238,14 @@ getHwStage(CC)[".scratch_memory_size"] = MsgPackDoc.getNode(Val); } +// Set the stack frame size of a function in the metadata. +void AMDGPUPALMetadata::setStackFrameSize(const MachineFunction &MF, + unsigned Val) { + auto Node = MsgPackDoc.getMapNode(); + Node[".stack_frame_size_in_bytes"] = MsgPackDoc.getNode(Val); + getShaderFunctions()[MF.getFunction().getName()] = Node; +} + // Set the hardware register bit in PAL metadata to enable wave32 on the // shader of the given calling convention. void AMDGPUPALMetadata::setWave32(unsigned CC) { @@ -721,6 +729,24 @@ return Registers.getMap(); } +// Reference (create if necessary) the node for the shader functions map. 
+msgpack::DocNode &AMDGPUPALMetadata::refShaderFunctions() { + auto &N = + MsgPackDoc.getRoot() + .getMap(/*Convert=*/true)[MsgPackDoc.getNode("amdpal.pipelines")] + .getArray(/*Convert=*/true)[0] + .getMap(/*Convert=*/true)[MsgPackDoc.getNode(".shader_functions")]; + N.getMap(/*Convert=*/true); + return N; +} + +// Get (create if necessary) the shader functions map. +msgpack::MapDocNode AMDGPUPALMetadata::getShaderFunctions() { + if (ShaderFunctions.isEmpty()) + ShaderFunctions = refShaderFunctions(); + return ShaderFunctions.getMap(); +} + // Return the PAL metadata hardware shader stage name. static const char *getStageName(CallingConv::ID CC) { switch (CC) { diff --git a/llvm/test/CodeGen/AMDGPU/amdpal-callable.ll b/llvm/test/CodeGen/AMDGPU/amdpal-callable.ll --- a/llvm/test/CodeGen/AMDGPU/amdpal-callable.ll +++ b/llvm/test/CodeGen/AMDGPU/amdpal-callable.ll @@ -1,16 +1,161 @@ -; RUN: llc -mtriple=amdgcn--amdpal -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI -enable-var-scope %s -; RUN: llc -mtriple=amdgcn--amdpal -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI -enable-var-scope %s -; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GFX9 -enable-var-scope %s -; RUN: llc -global-isel -mtriple=amdgcn--amdpal -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GFX9 -enable-var-scope %s - -; GCN-LABEL: {{^}}gfx_callable_amdpal: -; GCN: .amdgpu_pal_metadata -; GCN-NEXT: --- -; GCN-NEXT: amdpal.pipelines: +; RUN: llc -mtriple=amdgcn--amdpal -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,SDAG -enable-var-scope %s +; RUN: llc -mtriple=amdgcn--amdpal -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,SDAG -enable-var-scope %s +; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,SDAG -enable-var-scope %s +; RUN: llc 
-global-isel -mtriple=amdgcn--amdpal -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GISEL -enable-var-scope %s + +declare float @extern_func(float) #0 +declare float @extern_func_many_args(<64 x float>) #0 + +@funcptr = external hidden unnamed_addr addrspace(4) constant void()*, align 4 + +define amdgpu_gfx float @no_stack(float %arg0) #0 { + %add = fadd float %arg0, 1.0 + ret float %add +} + +define amdgpu_gfx float @simple_stack(float %arg0) #0 { + %stack = alloca float, i32 4, align 4, addrspace(5) + store volatile float 2.0, float addrspace(5)* %stack + %val = load volatile float, float addrspace(5)* %stack + %add = fadd float %arg0, %val + ret float %add +} + +define amdgpu_gfx float @multiple_stack(float %arg0) #0 { + %stack = alloca float, i32 4, align 4, addrspace(5) + store volatile float 2.0, float addrspace(5)* %stack + %val = load volatile float, float addrspace(5)* %stack + %add = fadd float %arg0, %val + %stack2 = alloca float, i32 4, align 4, addrspace(5) + store volatile float 2.0, float addrspace(5)* %stack2 + %val2 = load volatile float, float addrspace(5)* %stack2 + %add2 = fadd float %add, %val2 + ret float %add2 +} + +define amdgpu_gfx float @dynamic_stack(float %arg0) #0 { +bb0: + %cmp = fcmp ogt float %arg0, 0.0 + br i1 %cmp, label %bb1, label %bb2 + +bb1: + %stack = alloca float, i32 4, align 4, addrspace(5) + store volatile float 2.0, float addrspace(5)* %stack + %val = load volatile float, float addrspace(5)* %stack + %add = fadd float %arg0, %val + br label %bb2 + +bb2: + %res = phi float [ 0.0, %bb0 ], [ %add, %bb1 ] + ret float %res +} + +define amdgpu_gfx float @dynamic_stack_loop(float %arg0) #0 { +bb0: + br label %bb1 + +bb1: + %ctr = phi i32 [ 0, %bb0 ], [ %newctr, %bb1 ] + %stack = alloca float, i32 4, align 4, addrspace(5) + store volatile float 2.0, float addrspace(5)* %stack + %val = load volatile float, float addrspace(5)* %stack + %add = fadd float %arg0, %val + %cmp = icmp sgt i32 %ctr, 0 + %newctr = 
sub i32 %ctr, 1 + br i1 %cmp, label %bb1, label %bb2 + +bb2: + ret float %add +} + +define amdgpu_gfx float @no_stack_call(float %arg0) #0 { + %res = call amdgpu_gfx float @simple_stack(float %arg0) + ret float %res +} + +define amdgpu_gfx float @simple_stack_call(float %arg0) #0 { + %stack = alloca float, i32 4, align 4, addrspace(5) + store volatile float 2.0, float addrspace(5)* %stack + %val = load volatile float, float addrspace(5)* %stack + %res = call amdgpu_gfx float @simple_stack(float %arg0) + %add = fadd float %res, %val + ret float %add +} + +define amdgpu_gfx float @no_stack_extern_call(float %arg0) #0 { + %res = call amdgpu_gfx float @extern_func(float %arg0) + ret float %res +} + +define amdgpu_gfx float @simple_stack_extern_call(float %arg0) #0 { + %stack = alloca float, i32 4, align 4, addrspace(5) + store volatile float 2.0, float addrspace(5)* %stack + %val = load volatile float, float addrspace(5)* %stack + %res = call amdgpu_gfx float @extern_func(float %arg0) + %add = fadd float %res, %val + ret float %add +} + +define amdgpu_gfx float @no_stack_extern_call_many_args(<64 x float> %arg0) #0 { + %res = call amdgpu_gfx float @extern_func_many_args(<64 x float> %arg0) + ret float %res +} + +define amdgpu_gfx float @no_stack_indirect_call(float %arg0) #0 { + %fptr = load void()*, void()* addrspace(4)* @funcptr + call amdgpu_gfx void %fptr() + ret float %arg0 +} + +define amdgpu_gfx float @simple_stack_indirect_call(float %arg0) #0 { + %stack = alloca float, i32 4, align 4, addrspace(5) + store volatile float 2.0, float addrspace(5)* %stack + %val = load volatile float, float addrspace(5)* %stack + %fptr = load void()*, void()* addrspace(4)* @funcptr + call amdgpu_gfx void %fptr() + %add = fadd float %arg0, %val + ret float %add +} + +define amdgpu_gfx float @simple_stack_recurse(float %arg0) #0 { + %stack = alloca float, i32 4, align 4, addrspace(5) + store volatile float 2.0, float addrspace(5)* %stack + %val = load volatile float, float 
addrspace(5)* %stack + %res = call amdgpu_gfx float @simple_stack_recurse(float %arg0) + %add = fadd float %res, %val + ret float %add +} + +attributes #0 = { nounwind } + +; GCN: amdpal.pipelines: ; GCN-NEXT: - .registers: {} +; GCN-NEXT: .shader_functions: +; GCN-NEXT: dynamic_stack: +; GCN-NEXT: .stack_frame_size_in_bytes: 0x10{{$}} +; GCN-NEXT: dynamic_stack_loop: +; GCN-NEXT: .stack_frame_size_in_bytes: 0x10{{$}} +; GCN-NEXT: multiple_stack: +; GCN-NEXT: .stack_frame_size_in_bytes: 0x24{{$}} +; GCN-NEXT: no_stack: +; GCN-NEXT: .stack_frame_size_in_bytes: 0{{$}} +; GCN-NEXT: no_stack_call: +; GCN-NEXT: .stack_frame_size_in_bytes: 0{{$}} +; GCN-NEXT: no_stack_extern_call: +; GCN-NEXT: .stack_frame_size_in_bytes: 0x10{{$}} +; GCN-NEXT: no_stack_extern_call_many_args: +; SDAG-NEXT: .stack_frame_size_in_bytes: 0x90{{$}} +; GISEL-NEXT: .stack_frame_size_in_bytes: 0xd0{{$}} +; GCN-NEXT: no_stack_indirect_call: +; GCN-NEXT: .stack_frame_size_in_bytes: 0x10{{$}} +; GCN-NEXT: simple_stack: +; GCN-NEXT: .stack_frame_size_in_bytes: 0x14{{$}} +; GCN-NEXT: simple_stack_call: +; GCN-NEXT: .stack_frame_size_in_bytes: 0x20{{$}} +; GCN-NEXT: simple_stack_extern_call: +; GCN-NEXT: .stack_frame_size_in_bytes: 0x20{{$}} +; GCN-NEXT: simple_stack_indirect_call: +; GCN-NEXT: .stack_frame_size_in_bytes: 0x20{{$}} +; GCN-NEXT: simple_stack_recurse: +; GCN-NEXT: .stack_frame_size_in_bytes: 0x20{{$}} ; GCN-NEXT: ... -; GCN-NEXT: .end_amdgpu_pal_metadata -define amdgpu_gfx half @gfx_callable_amdpal(half %arg0) { - %add = fadd half %arg0, 1.0 - ret half %add -}