Index: lib/Target/AMDGPU/AMDGPUAsmPrinter.h =================================================================== --- lib/Target/AMDGPU/AMDGPUAsmPrinter.h +++ lib/Target/AMDGPU/AMDGPUAsmPrinter.h @@ -15,6 +15,7 @@ #ifndef LLVM_LIB_TARGET_AMDGPU_AMDGPUASMPRINTER_H #define LLVM_LIB_TARGET_AMDGPU_AMDGPUASMPRINTER_H +#include "AMDKernelCodeT.h" #include "llvm/ADT/StringRef.h" #include "llvm/CodeGen/AsmPrinter.h" #include <vector> @@ -89,6 +90,8 @@ }; void getSIProgramInfo(SIProgramInfo &Out, const MachineFunction &MF) const; + void getAmdKernelCode(amd_kernel_code_t &Out, const SIProgramInfo &KernelInfo, + const MachineFunction &MF) const; void findNumUsedRegistersSI(const MachineFunction &MF, unsigned &NumSGPR, unsigned &NumVGPR) const; @@ -97,8 +100,6 @@ /// can correctly setup the GPU state. void EmitProgramInfoR600(const MachineFunction &MF); void EmitProgramInfoSI(const MachineFunction &MF, const SIProgramInfo &KernelInfo); - void EmitAmdKernelCodeT(const MachineFunction &MF, - const SIProgramInfo &KernelInfo) const; public: explicit AMDGPUAsmPrinter(TargetMachine &TM, Index: lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp =================================================================== --- lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp +++ lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp @@ -21,7 +21,6 @@ #include "InstPrinter/AMDGPUInstPrinter.h" #include "Utils/AMDGPUBaseInfo.h" #include "AMDGPU.h" -#include "AMDKernelCodeT.h" #include "AMDGPUSubtarget.h" #include "R600Defines.h" #include "R600MachineFunctionInfo.h" @@ -141,14 +140,18 @@ void AMDGPUAsmPrinter::EmitFunctionBodyStart() { const AMDGPUSubtarget &STM = MF->getSubtarget<AMDGPUSubtarget>(); SIProgramInfo KernelInfo; + amd_kernel_code_t KernelCode; if (STM.isAmdCodeObjectV2(*MF)) { getSIProgramInfo(KernelInfo, *MF); - EmitAmdKernelCodeT(*MF, KernelInfo); + getAmdKernelCode(KernelCode, KernelInfo, *MF); + + OutStreamer->SwitchSection(getObjFileLowering().getTextSection()); + getTargetStreamer().EmitAMDKernelCodeT(KernelCode); } if
(TM.getTargetTriple().getOS() != Triple::AMDHSA) return; - getTargetStreamer().EmitKernelRuntimeMetadata(*MF->getFunction()); + getTargetStreamer().EmitKernelRuntimeMetadata(*MF->getFunction(), KernelCode); } void AMDGPUAsmPrinter::EmitFunctionEntryLabel() { @@ -724,94 +727,91 @@ } } -void AMDGPUAsmPrinter::EmitAmdKernelCodeT(const MachineFunction &MF, - const SIProgramInfo &KernelInfo) const { +void AMDGPUAsmPrinter::getAmdKernelCode(amd_kernel_code_t &Out, + const SIProgramInfo &KernelInfo, + const MachineFunction &MF) const { const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); const SISubtarget &STM = MF.getSubtarget<SISubtarget>(); - amd_kernel_code_t header; - AMDGPU::initDefaultAMDKernelCodeT(header, STM.getFeatureBits()); + AMDGPU::initDefaultAMDKernelCodeT(Out, STM.getFeatureBits()); - header.compute_pgm_resource_registers = + Out.compute_pgm_resource_registers = KernelInfo.ComputePGMRSrc1 | (KernelInfo.ComputePGMRSrc2 << 32); - header.code_properties = AMD_CODE_PROPERTY_IS_PTR64; + Out.code_properties = AMD_CODE_PROPERTY_IS_PTR64; - AMD_HSA_BITS_SET(header.code_properties, + AMD_HSA_BITS_SET(Out.code_properties, AMD_CODE_PROPERTY_PRIVATE_ELEMENT_SIZE, getElementByteSizeValue(STM.getMaxPrivateElementSize())); if (MFI->hasPrivateSegmentBuffer()) { - header.code_properties |= + Out.code_properties |= AMD_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_BUFFER; } if (MFI->hasDispatchPtr()) - header.code_properties |= AMD_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_PTR; + Out.code_properties |= AMD_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_PTR; if (MFI->hasQueuePtr()) - header.code_properties |= AMD_CODE_PROPERTY_ENABLE_SGPR_QUEUE_PTR; + Out.code_properties |= AMD_CODE_PROPERTY_ENABLE_SGPR_QUEUE_PTR; if (MFI->hasKernargSegmentPtr()) - header.code_properties |= AMD_CODE_PROPERTY_ENABLE_SGPR_KERNARG_SEGMENT_PTR; + Out.code_properties |= AMD_CODE_PROPERTY_ENABLE_SGPR_KERNARG_SEGMENT_PTR; if (MFI->hasDispatchID()) - header.code_properties |= AMD_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_ID; +
Out.code_properties |= AMD_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_ID; if (MFI->hasFlatScratchInit()) - header.code_properties |= AMD_CODE_PROPERTY_ENABLE_SGPR_FLAT_SCRATCH_INIT; + Out.code_properties |= AMD_CODE_PROPERTY_ENABLE_SGPR_FLAT_SCRATCH_INIT; // TODO: Private segment size if (MFI->hasGridWorkgroupCountX()) { - header.code_properties |= + Out.code_properties |= AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_X; } if (MFI->hasGridWorkgroupCountY()) { - header.code_properties |= + Out.code_properties |= AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_Y; } if (MFI->hasGridWorkgroupCountZ()) { - header.code_properties |= + Out.code_properties |= AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_Z; } if (MFI->hasDispatchPtr()) - header.code_properties |= AMD_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_PTR; + Out.code_properties |= AMD_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_PTR; if (STM.debuggerSupported()) - header.code_properties |= AMD_CODE_PROPERTY_IS_DEBUG_SUPPORTED; + Out.code_properties |= AMD_CODE_PROPERTY_IS_DEBUG_SUPPORTED; if (STM.isXNACKEnabled()) - header.code_properties |= AMD_CODE_PROPERTY_IS_XNACK_SUPPORTED; + Out.code_properties |= AMD_CODE_PROPERTY_IS_XNACK_SUPPORTED; // FIXME: Should use getKernArgSize - header.kernarg_segment_byte_size = + Out.kernarg_segment_byte_size = STM.getKernArgSegmentSize(MF, MFI->getABIArgOffset()); - header.wavefront_sgpr_count = KernelInfo.NumSGPR; - header.workitem_vgpr_count = KernelInfo.NumVGPR; - header.workitem_private_segment_byte_size = KernelInfo.ScratchSize; - header.workgroup_group_segment_byte_size = KernelInfo.LDSSize; - header.reserved_vgpr_first = KernelInfo.ReservedVGPRFirst; - header.reserved_vgpr_count = KernelInfo.ReservedVGPRCount; + Out.wavefront_sgpr_count = KernelInfo.NumSGPR; + Out.workitem_vgpr_count = KernelInfo.NumVGPR; + Out.workitem_private_segment_byte_size = KernelInfo.ScratchSize; + Out.workgroup_group_segment_byte_size = KernelInfo.LDSSize; + Out.reserved_vgpr_first = 
KernelInfo.ReservedVGPRFirst; + Out.reserved_vgpr_count = KernelInfo.ReservedVGPRCount; // These alignment values are specified in powers of two, so alignment = // 2^n. The minimum alignment is 2^4 = 16. - header.kernarg_segment_alignment = std::max((size_t)4, + Out.kernarg_segment_alignment = std::max((size_t)4, countTrailingZeros(MFI->getMaxKernArgAlign())); if (STM.debuggerEmitPrologue()) { - header.debug_wavefront_private_segment_offset_sgpr = + Out.debug_wavefront_private_segment_offset_sgpr = KernelInfo.DebuggerWavefrontPrivateSegmentOffsetSGPR; - header.debug_private_segment_buffer_sgpr = + Out.debug_private_segment_buffer_sgpr = KernelInfo.DebuggerPrivateSegmentBufferSGPR; } - - OutStreamer->SwitchSection(getObjFileLowering().getTextSection()); - getTargetStreamer().EmitAMDKernelCodeT(header); } bool AMDGPUAsmPrinter::PrintAsmOperand(const MachineInstr *MI, unsigned OpNo, Index: lib/Target/AMDGPU/AMDGPURuntimeMetadata.h =================================================================== --- lib/Target/AMDGPU/AMDGPURuntimeMetadata.h +++ lib/Target/AMDGPU/AMDGPURuntimeMetadata.h @@ -127,6 +127,43 @@ // Alignment of pointee type const char ArgPointeeAlign[] = "amd.ArgPointeeAlign"; + const char KernelStatsWorkitemPrivateSegmentByteSize[] = + "amd.Kernel.Stats.WorkitemPrivateSegmentByteSize"; + const char KernelStatsWorkgroupGroupSegmentByteSize[] = + "amd.Kernel.Stats.WorkgroupGroupSegmentByteSize"; + const char KernelStatsGdsSegmentByteSize[] = + "amd.Kernel.Stats.GdsSegmentByteSize"; + const char KernelStatsKernargSegmentByteSize[] = + "amd.Kernel.Stats.KernargSegmentByteSize"; + const char KernelStatsWorkgroupNumFbarriers[] = + "amd.Kernel.Stats.WorkgroupNumFbarriers"; + const char KernelStatsWavefrontNumSGPRs[] = + "amd.Kernel.Stats.WavefrontNumSGPRs"; + const char KernelStatsWorkitemNumVGPRs[] = + "amd.Kernel.Stats.WorkitemNumVGPRs"; + const char KernelStatsReservedFirstVGPR[] = + "amd.Kernel.Stats.ReservedFirstVGPR"; + const char 
KernelStatsReservedNumVGPRs[] = + "amd.Kernel.Stats.ReservedNumVGPRs"; + const char KernelStatsReservedFirstSGPR[] = + "amd.Kernel.Stats.ReservedFirstSGPR"; + const char KernelStatsReservedNumSGPRs[] = + "amd.Kernel.Stats.ReservedNumSGPRs"; + const char KernelStatsDebugWavefrontPrivateSegmentOffsetSGPR[] = + "amd.Kernel.Stats.DebugWavefrontPrivateSegmentOffsetSGPR"; + const char KernelStatsDebugPrivateSegmentBufferSGPR[] = + "amd.Kernel.Stats.DebugPrivateSegmentBufferSGPR"; + const char KernelStatsKernargSegmentAlignment[] = + "amd.Kernel.Stats.KernargSegmentAlignment"; + const char KernelStatsGroupSegmentAlignment[] = + "amd.Kernel.Stats.GroupSegmentAlignment"; + const char KernelStatsPrivateSegmentAlignment[] = + "amd.Kernel.Stats.PrivateSegmentAlignment"; + const char KernelStatsWavefrontSize[] = + "amd.Kernel.Stats.WavefrontSize"; + const char KernelStatsCallConvention[] = + "amd.Kernel.Stats.CallConvention"; + } // end namespace KeyName namespace KernelArg { @@ -223,6 +260,24 @@ uint32_t KernelIndex = INVALID_KERNEL_INDEX; uint8_t NoPartialWorkGroups = 0; std::vector<KernelArg::Metadata> Args; + uint32_t WorkitemPrivateSegmentByteSize = 0; + uint32_t WorkgroupGroupSegmentByteSize = 0; + uint32_t GdsSegmentByteSize = 0; + uint64_t KernargSegmentByteSize = 0; + uint32_t WorkgroupNumFbarriers = 0; + uint16_t WavefrontNumSGPRs = 0; + uint16_t WorkitemNumVGPRs = 0; + uint16_t ReservedFirstVGPR = 0; + uint16_t ReservedNumVGPRs = 0; + uint16_t ReservedFirstSGPR = 0; + uint16_t ReservedNumSGPRs = 0; + uint16_t DebugWavefrontPrivateSegmentOffsetSGPR = 0; + uint16_t DebugPrivateSegmentBufferSGPR = 0; + uint8_t KernargSegmentAlignment = 0; + uint8_t GroupSegmentAlignment = 0; + uint8_t PrivateSegmentAlignment = 0; + uint8_t WavefrontSize = 0; + int32_t CallConvention = 0; Metadata() = default; }; Index: lib/Target/AMDGPU/MCTargetDesc/AMDGPURuntimeMetadataStreamer.h =================================================================== ---
lib/Target/AMDGPU/MCTargetDesc/AMDGPURuntimeMetadataStreamer.h +++ lib/Target/AMDGPU/MCTargetDesc/AMDGPURuntimeMetadataStreamer.h @@ -11,6 +11,7 @@ #define LLVM_LIB_TARGET_AMDGPU_MCTARGETDESC_AMDGPURUNTIMEMETADATASTREAMER_H #include "AMDGPURuntimeMetadata.h" +#include "AMDKernelCodeT.h" #include "llvm/ADT/StringRef.h" #include #include @@ -57,6 +58,8 @@ void streamHighLevelKernelMetadata(const Function &Func); + void streamKernelStatisticsMetadata(const amd_kernel_code_t &KernelCode); + void streamKernelArgMetadata(const Argument &Arg); void streamKernelArgMetadata(const DataLayout &DL, Type *Ty, @@ -73,7 +76,8 @@ void streamEnd() {} - void streamKernelMetadata(const Function &Func); + void streamKernelMetadata(const Function &Func, + const amd_kernel_code_t &KernelCode); std::string toYamlString(); }; Index: lib/Target/AMDGPU/MCTargetDesc/AMDGPURuntimeMetadataStreamer.cpp =================================================================== --- lib/Target/AMDGPU/MCTargetDesc/AMDGPURuntimeMetadataStreamer.cpp +++ lib/Target/AMDGPU/MCTargetDesc/AMDGPURuntimeMetadataStreamer.cpp @@ -83,6 +83,61 @@ YamlIO.mapOptional( KeyName::NoPartialWorkGroups, K.NoPartialWorkGroups, uint8_t(0)); YamlIO.mapRequired(KeyName::Args, K.Args); + + YamlIO.mapRequired( + KeyName::KernelStatsWorkitemPrivateSegmentByteSize, + K.WorkitemPrivateSegmentByteSize); + YamlIO.mapRequired( + KeyName::KernelStatsWorkgroupGroupSegmentByteSize, + K.WorkgroupGroupSegmentByteSize); + YamlIO.mapRequired( + KeyName::KernelStatsGdsSegmentByteSize, + K.GdsSegmentByteSize); + YamlIO.mapRequired( + KeyName::KernelStatsKernargSegmentByteSize, + K.KernargSegmentByteSize); + YamlIO.mapRequired( + KeyName::KernelStatsWorkgroupNumFbarriers, + K.WorkgroupNumFbarriers); + YamlIO.mapRequired( + KeyName::KernelStatsWavefrontNumSGPRs, + K.WavefrontNumSGPRs); + YamlIO.mapRequired( + KeyName::KernelStatsWorkitemNumVGPRs, + K.WorkitemNumVGPRs); + YamlIO.mapRequired( + KeyName::KernelStatsReservedFirstVGPR, + 
K.ReservedFirstVGPR); + YamlIO.mapRequired( + KeyName::KernelStatsReservedNumVGPRs, + K.ReservedNumVGPRs); + YamlIO.mapRequired( + KeyName::KernelStatsReservedFirstSGPR, + K.ReservedFirstSGPR); + YamlIO.mapRequired( + KeyName::KernelStatsReservedNumSGPRs, + K.ReservedNumSGPRs); + YamlIO.mapRequired( + KeyName::KernelStatsDebugWavefrontPrivateSegmentOffsetSGPR, + K.DebugWavefrontPrivateSegmentOffsetSGPR); + YamlIO.mapRequired( + KeyName::KernelStatsDebugPrivateSegmentBufferSGPR, + K.DebugPrivateSegmentBufferSGPR); + YamlIO.mapRequired( + KeyName::KernelStatsKernargSegmentAlignment, + K.KernargSegmentAlignment); + YamlIO.mapRequired( + KeyName::KernelStatsGroupSegmentAlignment, + K.GroupSegmentAlignment); + YamlIO.mapRequired( + KeyName::KernelStatsPrivateSegmentAlignment, + K.PrivateSegmentAlignment); + YamlIO.mapRequired( + KeyName::KernelStatsWavefrontSize, + K.WavefrontSize); + YamlIO.mapRequired( + KeyName::KernelStatsCallConvention, + K.CallConvention); } static const bool flow = true; }; @@ -332,6 +387,36 @@ } } +void Streamer::streamKernelStatisticsMetadata( + const amd_kernel_code_t &KernelCode) { + auto &K = Program.Kernels.back(); + + K.WorkitemPrivateSegmentByteSize = + KernelCode.workitem_private_segment_byte_size; + K.WorkgroupGroupSegmentByteSize = + KernelCode.workgroup_group_segment_byte_size; + K.GdsSegmentByteSize = + KernelCode.gds_segment_byte_size; + K.KernargSegmentByteSize = + KernelCode.kernarg_segment_byte_size; + K.WorkgroupNumFbarriers = KernelCode.workgroup_fbarrier_count; + K.WavefrontNumSGPRs = KernelCode.wavefront_sgpr_count; + K.WorkitemNumVGPRs = KernelCode.workitem_vgpr_count; + K.ReservedFirstVGPR = KernelCode.reserved_vgpr_first; + K.ReservedNumVGPRs = KernelCode.reserved_vgpr_count; + K.ReservedFirstSGPR = KernelCode.reserved_sgpr_first; + K.ReservedNumSGPRs = KernelCode.reserved_sgpr_count; + K.DebugWavefrontPrivateSegmentOffsetSGPR = + KernelCode.debug_wavefront_private_segment_offset_sgpr; + K.DebugPrivateSegmentBufferSGPR = + 
KernelCode.debug_private_segment_buffer_sgpr; + K.KernargSegmentAlignment = KernelCode.kernarg_segment_alignment; + K.GroupSegmentAlignment = KernelCode.group_segment_alignment; + K.PrivateSegmentAlignment = KernelCode.private_segment_alignment; + K.WavefrontSize = KernelCode.wavefront_size; + K.CallConvention = KernelCode.call_convention; +} + void Streamer::streamKernelArgMetadata(const Argument &Arg) { auto &Func = *Arg.getParent(); unsigned ArgNo = Arg.getArgNo(); @@ -444,7 +529,8 @@ streamPrintfInfoMetadata(Mod); } -void Streamer::streamKernelMetadata(const Function &Func) { +void Streamer::streamKernelMetadata(const Function &Func, + const amd_kernel_code_t &KernelCode) { if (Func.getCallingConv() != CallingConv::AMDGPU_KERNEL) return; Index: lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.h =================================================================== --- lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.h +++ lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.h @@ -52,7 +52,8 @@ virtual void EmitStartOfRuntimeMetadata(const FeatureBitset &Features, const Module &Mod); - virtual void EmitKernelRuntimeMetadata(const Function &Func); + virtual void EmitKernelRuntimeMetadata(const Function &Func, + const amd_kernel_code_t &KernelCode); virtual void EmitEmitEndOfRuntimeMetadata(); Index: lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp =================================================================== --- lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp +++ lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp @@ -44,8 +44,10 @@ RuntimeMetadataStreamer.streamBegin(Features, Mod); } -void AMDGPUTargetStreamer::EmitKernelRuntimeMetadata(const Function &Func) { - RuntimeMetadataStreamer.streamKernelMetadata(Func); +void AMDGPUTargetStreamer::EmitKernelRuntimeMetadata( + const Function &Func, + const amd_kernel_code_t &KernelCode) { + RuntimeMetadataStreamer.streamKernelMetadata(Func, KernelCode); } void 
AMDGPUTargetStreamer::EmitEmitEndOfRuntimeMetadata() {