diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.h b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.h
--- a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.h
@@ -78,6 +78,7 @@
                          const SIProgramInfo &KernelInfo);
   void EmitPALMetadata(const MachineFunction &MF,
                        const SIProgramInfo &KernelInfo);
+  void emitPALFunctionMetadata(const MachineFunction &MF);
   void emitCommonFunctionComments(uint32_t NumVGPR,
                                   Optional<uint32_t> NumAGPR,
                                   uint32_t TotalNumVGPR,
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
--- a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
@@ -456,9 +456,12 @@
     Info = analyzeResourceUsage(MF);
   }
 
-  if (STM.isAmdPalOS() && MFI->isEntryFunction())
-    EmitPALMetadata(MF, CurrentProgramInfo);
-  else if (!STM.isAmdHsaOS()) {
+  if (STM.isAmdPalOS()) {
+    if (MFI->isEntryFunction())
+      EmitPALMetadata(MF, CurrentProgramInfo);
+    else
+      emitPALFunctionMetadata(MF);
+  } else if (!STM.isAmdHsaOS()) {
     EmitProgramInfoSI(MF, CurrentProgramInfo);
   }
 
@@ -1260,6 +1263,12 @@
     MD->setWave32(MF.getFunction().getCallingConv());
 }
 
+void AMDGPUAsmPrinter::emitPALFunctionMetadata(const MachineFunction &MF) {
+  auto *MD = getTargetStreamer()->getPALMetadata();
+  const MachineFrameInfo &MFI = MF.getFrameInfo();
+  MD->setStackFrameSize(MF, MFI.getStackSize());
+}
+
 // This is supposed to be log2(Size)
 static amd_element_byte_size_t getElementByteSizeValue(unsigned Size) {
   switch (Size) {
diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUPALMetadata.h b/llvm/lib/Target/AMDGPU/Utils/AMDGPUPALMetadata.h
--- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUPALMetadata.h
+++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUPALMetadata.h
@@ -15,6 +15,7 @@
 #define LLVM_LIB_TARGET_AMDGPU_AMDGPUPALMETADATA_H
 
 #include "llvm/BinaryFormat/MsgPackDocument.h"
+#include "llvm/CodeGen/MachineFunction.h"
 
 namespace llvm {
 
@@ -26,6 +27,7 @@
   msgpack::Document MsgPackDoc;
   msgpack::DocNode Registers;
   msgpack::DocNode HwStages;
+  msgpack::DocNode ShaderFunctions;
 
 public:
   // Read the amdgpu.pal.metadata supplied by the frontend, ready for
@@ -76,6 +78,9 @@
   // Set the scratch size in the metadata.
   void setScratchSize(unsigned CC, unsigned Val);
 
+  // Set the stack frame size of a function in the metadata.
+  void setStackFrameSize(const MachineFunction &MF, unsigned Val);
+
   // Set the hardware register bit in PAL metadata to enable wave32 on the
   // shader of the given calling convention.
   void setWave32(unsigned CC);
@@ -119,6 +124,12 @@
   // Get (create if necessary) the registers map.
   msgpack::MapDocNode getRegisters();
 
+  // Reference (create if necessary) the node for the shader functions map.
+  msgpack::DocNode &refShaderFunctions();
+
+  // Get (create if necessary) the shader functions map.
+  msgpack::MapDocNode getShaderFunctions();
+
   // Get (create if necessary) the .hardware_stages entry for the given calling
   // convention.
   msgpack::MapDocNode getHwStage(unsigned CC);
diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUPALMetadata.cpp b/llvm/lib/Target/AMDGPU/Utils/AMDGPUPALMetadata.cpp
--- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUPALMetadata.cpp
+++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUPALMetadata.cpp
@@ -238,6 +238,14 @@
   getHwStage(CC)[".scratch_memory_size"] = MsgPackDoc.getNode(Val);
 }
 
+// Set the scratch size in the metadata.
+void AMDGPUPALMetadata::setStackFrameSize(const MachineFunction &MF,
+                                          unsigned Val) {
+  auto Node = MsgPackDoc.getMapNode();
+  Node[".stack_frame_size_in_bytes"] = MsgPackDoc.getNode(Val);
+  getShaderFunctions()[MF.getFunction().getName()] = Node;
+}
+
 // Set the hardware register bit in PAL metadata to enable wave32 on the
 // shader of the given calling convention.
 void AMDGPUPALMetadata::setWave32(unsigned CC) {
@@ -721,6 +729,24 @@
   return Registers.getMap();
 }
 
+// Reference (create if necessary) the node for the shader functions map.
+msgpack::DocNode &AMDGPUPALMetadata::refShaderFunctions() {
+  auto &N =
+      MsgPackDoc.getRoot()
+          .getMap(/*Convert=*/true)[MsgPackDoc.getNode("amdpal.pipelines")]
+          .getArray(/*Convert=*/true)[0]
+          .getMap(/*Convert=*/true)[MsgPackDoc.getNode(".shader_functions")];
+  N.getMap(/*Convert=*/true);
+  return N;
+}
+
+// Get (create if necessary) the shader functions map.
+msgpack::MapDocNode AMDGPUPALMetadata::getShaderFunctions() {
+  if (ShaderFunctions.isEmpty())
+    ShaderFunctions = refShaderFunctions();
+  return ShaderFunctions.getMap();
+}
+
 // Return the PAL metadata hardware shader stage name.
 static const char *getStageName(CallingConv::ID CC) {
   switch (CC) {
diff --git a/llvm/test/CodeGen/AMDGPU/amdpal-callable.ll b/llvm/test/CodeGen/AMDGPU/amdpal-callable.ll
--- a/llvm/test/CodeGen/AMDGPU/amdpal-callable.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdpal-callable.ll
@@ -1,16 +1,161 @@
-; RUN: llc -mtriple=amdgcn--amdpal -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI -enable-var-scope %s
-; RUN: llc -mtriple=amdgcn--amdpal -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI -enable-var-scope %s
-; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GFX9 -enable-var-scope %s
-; RUN: llc -global-isel -mtriple=amdgcn--amdpal -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GFX9 -enable-var-scope %s
-
-; GCN-LABEL: {{^}}gfx_callable_amdpal:
-; GCN:         .amdgpu_pal_metadata
-; GCN-NEXT: ---
-; GCN-NEXT: amdpal.pipelines:
+; RUN: llc -mtriple=amdgcn--amdpal -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,SDAG -enable-var-scope %s
+; RUN: llc -mtriple=amdgcn--amdpal -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,SDAG -enable-var-scope %s
+; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,SDAG -enable-var-scope %s
+; RUN: llc -global-isel -mtriple=amdgcn--amdpal -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GISEL -enable-var-scope %s
+
+declare float @extern_func(float) #0
+declare float @extern_func_many_args(<64 x float>) #0
+
+@funcptr = external hidden unnamed_addr addrspace(4) constant void()*, align 4
+
+define amdgpu_gfx float @no_stack(float %arg0) #0 {
+  %add = fadd float %arg0, 1.0
+  ret float %add
+}
+
+define amdgpu_gfx float @simple_stack(float %arg0) #0 {
+  %stack = alloca float, i32 4, align 4, addrspace(5)
+  store volatile float 2.0, float addrspace(5)* %stack
+  %val = load volatile float, float addrspace(5)* %stack
+  %add = fadd float %arg0, %val
+  ret float %add
+}
+
+define amdgpu_gfx float @multiple_stack(float %arg0) #0 {
+  %stack = alloca float, i32 4, align 4, addrspace(5)
+  store volatile float 2.0, float addrspace(5)* %stack
+  %val = load volatile float, float addrspace(5)* %stack
+  %add = fadd float %arg0, %val
+  %stack2 = alloca float, i32 4, align 4, addrspace(5)
+  store volatile float 2.0, float addrspace(5)* %stack2
+  %val2 = load volatile float, float addrspace(5)* %stack2
+  %add2 = fadd float %add, %val2
+  ret float %add2
+}
+
+define amdgpu_gfx float @dynamic_stack(float %arg0) #0 {
+bb0:
+  %cmp = fcmp ogt float %arg0, 0.0
+  br i1 %cmp, label %bb1, label %bb2
+
+bb1:
+  %stack = alloca float, i32 4, align 4, addrspace(5)
+  store volatile float 2.0, float addrspace(5)* %stack
+  %val = load volatile float, float addrspace(5)* %stack
+  %add = fadd float %arg0, %val
+  br label %bb2
+
+bb2:
+  %res = phi float [ 0.0, %bb0 ], [ %add, %bb1 ]
+  ret float %res
+}
+
+define amdgpu_gfx float @dynamic_stack_loop(float %arg0) #0 {
+bb0:
+  br label %bb1
+
+bb1:
+  %ctr = phi i32 [ 0, %bb0 ], [ %newctr, %bb1 ]
+  %stack = alloca float, i32 4, align 4, addrspace(5)
+  store volatile float 2.0, float addrspace(5)* %stack
+  %val = load volatile float, float addrspace(5)* %stack
+  %add = fadd float %arg0, %val
+  %cmp = icmp sgt i32 %ctr, 0
+  %newctr = sub i32 %ctr, 1
+  br i1 %cmp, label %bb1, label %bb2
+
+bb2:
+  ret float %add
+}
+
+define amdgpu_gfx float @no_stack_call(float %arg0) #0 {
+  %res = call amdgpu_gfx float @simple_stack(float %arg0)
+  ret float %res
+}
+
+define amdgpu_gfx float @simple_stack_call(float %arg0) #0 {
+  %stack = alloca float, i32 4, align 4, addrspace(5)
+  store volatile float 2.0, float addrspace(5)* %stack
+  %val = load volatile float, float addrspace(5)* %stack
+  %res = call amdgpu_gfx float @simple_stack(float %arg0)
+  %add = fadd float %res, %val
+  ret float %add
+}
+
+define amdgpu_gfx float @no_stack_extern_call(float %arg0) #0 {
+  %res = call amdgpu_gfx float @extern_func(float %arg0)
+  ret float %res
+}
+
+define amdgpu_gfx float @simple_stack_extern_call(float %arg0) #0 {
+  %stack = alloca float, i32 4, align 4, addrspace(5)
+  store volatile float 2.0, float addrspace(5)* %stack
+  %val = load volatile float, float addrspace(5)* %stack
+  %res = call amdgpu_gfx float @extern_func(float %arg0)
+  %add = fadd float %res, %val
+  ret float %add
+}
+
+define amdgpu_gfx float @no_stack_extern_call_many_args(<64 x float> %arg0) #0 {
+  %res = call amdgpu_gfx float @extern_func_many_args(<64 x float> %arg0)
+  ret float %res
+}
+
+define amdgpu_gfx float @no_stack_indirect_call(float %arg0) #0 {
+  %fptr = load void()*, void()* addrspace(4)* @funcptr
+  call amdgpu_gfx void %fptr()
+  ret float %arg0
+}
+
+define amdgpu_gfx float @simple_stack_indirect_call(float %arg0) #0 {
+  %stack = alloca float, i32 4, align 4, addrspace(5)
+  store volatile float 2.0, float addrspace(5)* %stack
+  %val = load volatile float, float addrspace(5)* %stack
+  %fptr = load void()*, void()* addrspace(4)* @funcptr
+  call amdgpu_gfx void %fptr()
+  %add = fadd float %arg0, %val
+  ret float %add
+}
+
+define amdgpu_gfx float @simple_stack_recurse(float %arg0) #0 {
+  %stack = alloca float, i32 4, align 4, addrspace(5)
+  store volatile float 2.0, float addrspace(5)* %stack
+  %val = load volatile float, float addrspace(5)* %stack
+  %res = call amdgpu_gfx float @simple_stack_recurse(float %arg0)
+  %add = fadd float %res, %val
+  ret float %add
+}
+
+attributes #0 = { nounwind }
+
+; GCN: amdpal.pipelines:
 ; GCN-NEXT:   - .registers:      {}
+; GCN-NEXT:    .shader_functions:
+; GCN-NEXT:      dynamic_stack:
+; GCN-NEXT:        .stack_frame_size_in_bytes: 0x10{{$}}
+; GCN-NEXT:      dynamic_stack_loop:
+; GCN-NEXT:        .stack_frame_size_in_bytes: 0x10{{$}}
+; GCN-NEXT:      multiple_stack:
+; GCN-NEXT:        .stack_frame_size_in_bytes: 0x24{{$}}
+; GCN-NEXT:      no_stack:
+; GCN-NEXT:        .stack_frame_size_in_bytes: 0{{$}}
+; GCN-NEXT:      no_stack_call:
+; GCN-NEXT:        .stack_frame_size_in_bytes: 0{{$}}
+; GCN-NEXT:      no_stack_extern_call:
+; GCN-NEXT:        .stack_frame_size_in_bytes: 0x10{{$}}
+; GCN-NEXT:      no_stack_extern_call_many_args:
+; SDAG-NEXT:        .stack_frame_size_in_bytes: 0x90{{$}}
+; GISEL-NEXT:        .stack_frame_size_in_bytes: 0xd0{{$}}
+; GCN-NEXT:      no_stack_indirect_call:
+; GCN-NEXT:        .stack_frame_size_in_bytes: 0x10{{$}}
+; GCN-NEXT:      simple_stack:
+; GCN-NEXT:        .stack_frame_size_in_bytes: 0x14{{$}}
+; GCN-NEXT:      simple_stack_call:
+; GCN-NEXT:        .stack_frame_size_in_bytes: 0x20{{$}}
+; GCN-NEXT:      simple_stack_extern_call:
+; GCN-NEXT:        .stack_frame_size_in_bytes: 0x20{{$}}
+; GCN-NEXT:      simple_stack_indirect_call:
+; GCN-NEXT:        .stack_frame_size_in_bytes: 0x20{{$}}
+; GCN-NEXT:      simple_stack_recurse:
+; GCN-NEXT:        .stack_frame_size_in_bytes: 0x20{{$}}
 ; GCN-NEXT: ...
-; GCN-NEXT:         .end_amdgpu_pal_metadata
-define amdgpu_gfx half @gfx_callable_amdpal(half %arg0) {
-  %add = fadd half %arg0, 1.0
-  ret half %add
-}