diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.h b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.h
--- a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.h
@@ -78,6 +78,7 @@
                          const SIProgramInfo &KernelInfo);
   void EmitPALMetadata(const MachineFunction &MF,
                        const SIProgramInfo &KernelInfo);
+  void emitPALFunctionMetadata(const MachineFunction &MF);
   void emitCommonFunctionComments(uint32_t NumVGPR,
                                   Optional<uint32_t> NumAGPR,
                                   uint32_t TotalNumVGPR,
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
--- a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
@@ -456,9 +456,12 @@
     Info = analyzeResourceUsage(MF);
   }
 
-  if (STM.isAmdPalOS() && MFI->isEntryFunction())
-    EmitPALMetadata(MF, CurrentProgramInfo);
-  else if (!STM.isAmdHsaOS()) {
+  if (STM.isAmdPalOS()) {
+    if (MFI->isEntryFunction())
+      EmitPALMetadata(MF, CurrentProgramInfo);
+    else
+      emitPALFunctionMetadata(MF);
+  } else if (!STM.isAmdHsaOS()) {
     EmitProgramInfoSI(MF, CurrentProgramInfo);
   }
 
@@ -1266,6 +1269,15 @@
     MD->setWave32(MF.getFunction().getCallingConv());
 }
 
+void AMDGPUAsmPrinter::emitPALFunctionMetadata(const MachineFunction &MF) {
+  auto CC = MF.getFunction().getCallingConv();
+  if (CC == CallingConv::AMDGPU_Gfx) { // Nothing is emitted for other CCs.
+    auto *MD = getTargetStreamer()->getPALMetadata();
+    const MachineFrameInfo &MFI = MF.getFrameInfo();
+    MD->setStackFrameSize(MF, MFI.getStackSize());
+  }
+}
+
 // This is supposed to be log2(Size)
 static amd_element_byte_size_t getElementByteSizeValue(unsigned Size) {
   switch (Size) {
diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUPALMetadata.h b/llvm/lib/Target/AMDGPU/Utils/AMDGPUPALMetadata.h
--- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUPALMetadata.h
+++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUPALMetadata.h
@@ -15,6 +15,7 @@
 #define LLVM_LIB_TARGET_AMDGPU_AMDGPUPALMETADATA_H
 
 #include "llvm/BinaryFormat/MsgPackDocument.h"
+#include "llvm/CodeGen/MachineFunction.h"
 
 namespace llvm {
 
@@ -26,6 +27,7 @@
   msgpack::Document MsgPackDoc;
   msgpack::DocNode Registers;
   msgpack::DocNode HwStages;
+  msgpack::DocNode ShaderFunctions;
 
 public:
   // Read the amdgpu.pal.metadata supplied by the frontend, ready for
@@ -76,6 +78,9 @@
   // Set the scratch size in the metadata.
   void setScratchSize(unsigned CC, unsigned Val);
 
+  // Set the stack frame size of a function in the metadata.
+  void setStackFrameSize(const MachineFunction &MF, unsigned Val);
+
   // Set the hardware register bit in PAL metadata to enable wave32 on the
   // shader of the given calling convention.
   void setWave32(unsigned CC);
@@ -119,6 +124,12 @@
   // Get (create if necessary) the registers map.
   msgpack::MapDocNode getRegisters();
 
+  // Reference (create if necessary) the node for the shader functions map.
+  msgpack::DocNode &refShaderFunctions();
+
+  // Get (create if necessary) the shader functions map.
+  msgpack::MapDocNode getShaderFunctions();
+
   // Get (create if necessary) the .hardware_stages entry for the given calling
   // convention.
   msgpack::MapDocNode getHwStage(unsigned CC);
diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUPALMetadata.cpp b/llvm/lib/Target/AMDGPU/Utils/AMDGPUPALMetadata.cpp
--- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUPALMetadata.cpp
+++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUPALMetadata.cpp
@@ -238,6 +238,14 @@
   getHwStage(CC)[".scratch_memory_size"] = MsgPackDoc.getNode(Val);
 }
 
+// Set the stack frame size of a function in the metadata.
+void AMDGPUPALMetadata::setStackFrameSize(const MachineFunction &MF,
+                                          unsigned Val) {
+  auto Node = MsgPackDoc.getMapNode();
+  Node[".stack_frame_size_in_bytes"] = MsgPackDoc.getNode(Val);
+  getShaderFunctions()[MF.getFunction().getName()] = Node;
+}
+
 // Set the hardware register bit in PAL metadata to enable wave32 on the
 // shader of the given calling convention.
 void AMDGPUPALMetadata::setWave32(unsigned CC) {
@@ -721,6 +729,24 @@
   return Registers.getMap();
 }
 
+// Reference (create if necessary) the node for the shader functions map.
+msgpack::DocNode &AMDGPUPALMetadata::refShaderFunctions() {
+  auto &N =
+      MsgPackDoc.getRoot()
+          .getMap(/*Convert=*/true)[MsgPackDoc.getNode("amdpal.pipelines")]
+          .getArray(/*Convert=*/true)[0]
+          .getMap(/*Convert=*/true)[MsgPackDoc.getNode(".shader_functions")];
+  N.getMap(/*Convert=*/true); // Convert N itself to a map if needed.
+  return N;
+}
+
+// Get (create if necessary) the shader functions map.
+msgpack::MapDocNode AMDGPUPALMetadata::getShaderFunctions() {
+  if (ShaderFunctions.isEmpty()) // Not resolved yet; cache the node.
+    ShaderFunctions = refShaderFunctions();
+  return ShaderFunctions.getMap();
+}
+
 // Return the PAL metadata hardware shader stage name.
 static const char *getStageName(CallingConv::ID CC) {
   switch (CC) {
diff --git a/llvm/test/CodeGen/AMDGPU/amdpal-callable.ll b/llvm/test/CodeGen/AMDGPU/amdpal-callable.ll
--- a/llvm/test/CodeGen/AMDGPU/amdpal-callable.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdpal-callable.ll
@@ -2,14 +2,113 @@
 ; RUN: llc -mtriple=amdgcn--amdpal -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI -enable-var-scope %s
 ; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GFX9 -enable-var-scope %s
 
-; GCN-LABEL: {{^}}gfx_callable_amdpal:
-; GCN:         .amdgpu_pal_metadata
-; GCN-NEXT: ---
-; GCN-NEXT: amdpal.pipelines:
-; GCN-NEXT:   - .registers:      {}
-; GCN-NEXT: ...
-; GCN-NEXT:         .end_amdgpu_pal_metadata
-define amdgpu_gfx half @gfx_callable_amdpal(half %arg0) {
-  %add = fadd half %arg0, 1.0
-  ret half %add
+declare float @extern_func(float) #0
+declare float @extern_func_many_args(<64 x float>) #0
+
+@funcptr = external hidden unnamed_addr addrspace(4) constant void()*, align 4
+
+define amdgpu_gfx float @no_stack(float %arg0) #0 {
+  %add = fadd float %arg0, 1.0
+  ret float %add
 }
+
+define amdgpu_gfx float @simple_stack(float %arg0) #0 {
+  %stack = alloca float, i32 4, align 4, addrspace(5)
+  store volatile float 2.0, float addrspace(5)* %stack
+  %val = load volatile float, float addrspace(5)* %stack
+  %add = fadd float %arg0, %val
+  ret float %add
+}
+
+; Disabled: dynamic alloca is unsupported.
+;define amdgpu_gfx float @dynamic_stack(float %arg0, i32 %size) #0 {
+;  %stack = alloca float, i32 %size, align 4, addrspace(5)
+;  store volatile float 2.0, float addrspace(5)* %stack
+;  %val = load volatile float, float addrspace(5)* %stack
+;  %add = fadd float %arg0, %val
+;  ret float %add
+;}
+
+define amdgpu_gfx float @no_stack_call(float %arg0) #0 {
+  %res = call amdgpu_gfx float @simple_stack(float %arg0)
+  ret float %res
+}
+
+define amdgpu_gfx float @simple_stack_call(float %arg0) #0 {
+  %stack = alloca float, i32 4, align 4, addrspace(5)
+  store volatile float 2.0, float addrspace(5)* %stack
+  %val = load volatile float, float addrspace(5)* %stack
+  %res = call amdgpu_gfx float @simple_stack(float %arg0)
+  %add = fadd float %res, %val
+  ret float %add
+}
+
+define amdgpu_gfx float @no_stack_extern_call(float %arg0) #0 {
+  %res = call amdgpu_gfx float @extern_func(float %arg0)
+  ret float %res
+}
+
+define amdgpu_gfx float @simple_stack_extern_call(float %arg0) #0 {
+  %stack = alloca float, i32 4, align 4, addrspace(5)
+  store volatile float 2.0, float addrspace(5)* %stack
+  %val = load volatile float, float addrspace(5)* %stack
+  %res = call amdgpu_gfx float @extern_func(float %arg0)
+  %add = fadd float %res, %val
+  ret float %add
+}
+
+define amdgpu_gfx float @no_stack_extern_call_many_args(<64 x float> %arg0) #0 {
+  %res = call amdgpu_gfx float @extern_func_many_args(<64 x float> %arg0)
+  ret float %res
+}
+
+define amdgpu_gfx float @no_stack_indirect_call(float %arg0) #0 {
+  %fptr = load void()*, void()* addrspace(4)* @funcptr
+  call amdgpu_gfx void %fptr()
+  ret float %arg0
+}
+
+define amdgpu_gfx float @simple_stack_indirect_call(float %arg0) #0 {
+  %stack = alloca float, i32 4, align 4, addrspace(5)
+  store volatile float 2.0, float addrspace(5)* %stack
+  %val = load volatile float, float addrspace(5)* %stack
+  %fptr = load void()*, void()* addrspace(4)* @funcptr
+  call amdgpu_gfx void %fptr()
+  %add = fadd float %arg0, %val
+  ret float %add
+}
+
+define amdgpu_gfx float @simple_stack_recurse(float %arg0) #0 {
+  %stack = alloca float, i32 4, align 4, addrspace(5)
+  store volatile float 2.0, float addrspace(5)* %stack
+  %val = load volatile float, float addrspace(5)* %stack
+  %res = call amdgpu_gfx float @simple_stack_recurse(float %arg0)
+  %add = fadd float %res, %val
+  ret float %add
+}
+
+attributes #0 = { nounwind }
+
+; GCN: amdpal.pipelines:
+; GCN-NEXT:   - .registers:      {}
+; GCN-NEXT:    .shader_functions:
+; GCN-NEXT:      no_stack:
+; GCN-NEXT:        .stack_frame_size_in_bytes: 0
+; GCN-NEXT:      no_stack_call:
+; GCN-NEXT:        .stack_frame_size_in_bytes: 0
+; GCN-NEXT:      no_stack_extern_call:
+; GCN-NEXT:        .stack_frame_size_in_bytes: 0x10
+; GCN-NEXT:      no_stack_extern_call_many_args:
+; GCN-NEXT:        .stack_frame_size_in_bytes: 0x90
+; GCN-NEXT:      no_stack_indirect_call:
+; GCN-NEXT:        .stack_frame_size_in_bytes: 0x10
+; GCN-NEXT:      simple_stack:
+; GCN-NEXT:        .stack_frame_size_in_bytes: 0x14
+; GCN-NEXT:      simple_stack_call:
+; GCN-NEXT:        .stack_frame_size_in_bytes: 0x20
+; GCN-NEXT:      simple_stack_extern_call:
+; GCN-NEXT:        .stack_frame_size_in_bytes: 0x20
+; GCN-NEXT:      simple_stack_indirect_call:
+; GCN-NEXT:        .stack_frame_size_in_bytes: 0x20
+; GCN-NEXT:      simple_stack_recurse:
+; GCN-NEXT:        .stack_frame_size_in_bytes: 0x20