Index: include/llvm/BinaryFormat/AMDGPUMetadataVerifier.h
===================================================================
--- /dev/null
+++ include/llvm/BinaryFormat/AMDGPUMetadataVerifier.h
@@ -0,0 +1,70 @@
+//===- AMDGPUMetadataVerifier.h - MsgPack Types -----------------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// This is a verifier for AMDGPU HSA metadata, which can verify both
+/// well-typed metadata and untyped metadata. When verifying in the non-strict
+/// mode, untyped metadata is coerced into the correct type if possible.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_BINARYFORMAT_AMDGPUMETADATAVERIFIER_H
+#define LLVM_BINARYFORMAT_AMDGPUMETADATAVERIFIER_H
+
+#include "llvm/BinaryFormat/MsgPackTypes.h"
+
+namespace llvm {
+namespace AMDGPU {
+namespace HSAMD {
+namespace V3 {
+
+/// Verifier for AMDGPU HSA metadata.
+///
+/// Operates in two modes:
+///
+/// In strict mode, metadata must already be well-typed.
+///
+/// In non-strict mode, metadata is coerced into expected types when possible.
+class MetadataVerifier {
+  bool Strict;
+
+  bool verifyScalar(msgpack::Node &Node, msgpack::ScalarNode::ScalarKind SKind,
+                    function_ref<bool(msgpack::ScalarNode &)> verifyValue = {});
+  bool verifyInteger(msgpack::Node &Node);
+  bool verifyArray(msgpack::Node &Node,
+                   function_ref<bool(msgpack::Node &)> verifyNode,
+                   Optional<size_t> Size = None);
+  bool verifyEntry(msgpack::MapNode<> &MapNode, StringRef Key, bool Required,
+                   function_ref<bool(msgpack::Node &)> verifyNode);
+  bool
+  verifyScalarEntry(msgpack::MapNode<> &MapNode, StringRef Key, bool Required,
+                    msgpack::ScalarNode::ScalarKind SKind,
+                    function_ref<bool(msgpack::ScalarNode &)> verifyValue = {});
+  bool verifyIntegerEntry(msgpack::MapNode<> &MapNode, StringRef Key,
+                          bool Required);
+  bool verifyKernelArgs(msgpack::Node &Node);
+  bool verifyKernel(msgpack::Node &Node);
+
+public:
+  /// Construct a MetadataVerifier, specifying whether it will operate in \p
+  /// Strict mode.
+  MetadataVerifier(bool Strict) : Strict(Strict) {}
+
+  /// Verify given HSA metadata.
+  ///
+  /// \returns True when successful, false when metadata is invalid.
+  bool verify(std::unique_ptr<msgpack::Node> &HSAMetadataRoot);
+};
+
+} // end namespace V3
+} // end namespace HSAMD
+} // end namespace AMDGPU
+} // end namespace llvm
+
+#endif // LLVM_BINARYFORMAT_AMDGPUMETADATAVERIFIER_H
Index: include/llvm/BinaryFormat/ELF.h
===================================================================
--- include/llvm/BinaryFormat/ELF.h
+++ include/llvm/BinaryFormat/ELF.h
@@ -1318,7 +1318,7 @@
   GNU_PROPERTY_X86_FEATURE_1_SHSTK = 1 << 1
 };
 
-// AMDGPU specific notes.
+// AMD specific notes. (Code Object V2)
 enum {
   // Note types with values between 0 and 9 (inclusive) are reserved.
   NT_AMD_AMDGPU_HSA_METADATA = 10,
@@ -1326,6 +1326,12 @@
   NT_AMD_AMDGPU_PAL_METADATA = 12
 };
 
+// AMDGPU specific notes. (Code Object V3)
+enum {
+  // Note type value 0 is reserved.
+  NT_AMDGPU_METADATA = 1
+};
+
 enum {
   GNU_ABI_TAG_LINUX = 0,
   GNU_ABI_TAG_HURD = 1,
Index: include/llvm/Support/AMDGPUMetadata.h
===================================================================
--- include/llvm/Support/AMDGPUMetadata.h
+++ include/llvm/Support/AMDGPUMetadata.h
@@ -431,6 +431,133 @@
 /// Converts \p HSAMetadata to \p String.
 std::error_code toString(Metadata HSAMetadata, std::string &String);
 
+//===----------------------------------------------------------------------===//
+// HSA metadata for v3 code object.
+//===----------------------------------------------------------------------===//
+namespace V3 {
+
+/// HSA metadata major version.
+constexpr uint32_t VersionMajor = 1;
+/// HSA metadata minor version.
+constexpr uint32_t VersionMinor = 0;
+
+//===----------------------------------------------------------------------===//
+// Kernel Metadata.
+//===----------------------------------------------------------------------===//
+namespace Kernel {
+
+//===----------------------------------------------------------------------===//
+// Kernel Argument Metadata.
+//===----------------------------------------------------------------------===//
+namespace Arg {
+
+namespace Key {
+constexpr char Name[] = ".name";
+constexpr char TypeName[] = ".type_name";
+constexpr char Size[] = ".size";
+constexpr char Offset[] = ".offset";
+constexpr char ValueKind[] = ".value_kind";
+constexpr char ValueType[] = ".value_type";
+constexpr char PointeeAlign[] = ".pointee_align";
+constexpr char AddressSpace[] = ".address_space";
+constexpr char Access[] = ".access";
+constexpr char ActualAccess[] = ".actual_access";
+constexpr char IsConst[] = ".is_const";
+constexpr char IsRestrict[] = ".is_restrict";
+constexpr char IsVolatile[] = ".is_volatile";
+constexpr char IsPipe[] = ".is_pipe";
+} // end namespace Key
+
+namespace Access {
+constexpr char ReadOnly[] = "read_only";
+constexpr char WriteOnly[] = "write_only";
+constexpr char ReadWrite[] = "read_write";
+} // end namespace Access
+
+namespace AddressSpace {
+constexpr char Private[] = "private";
+constexpr char Global[] = "global";
+constexpr char Constant[] = "constant";
+constexpr char Local[] = "local";
+constexpr char Generic[] = "generic";
+constexpr char Region[] = "region";
+} // end namespace AddressSpace
+
+namespace ValueKind {
+constexpr char ByValue[] = "by_value";
+constexpr char GlobalBuffer[] = "global_buffer";
+constexpr char DynamicSharedPointer[] = "dynamic_shared_pointer";
+constexpr char Sampler[] = "sampler";
+constexpr char Image[] = "image";
+constexpr char Pipe[] = "pipe";
+constexpr char Queue[] = "queue";
+constexpr char HiddenGlobalOffsetX[] = "hidden_global_offset_x";
+constexpr char HiddenGlobalOffsetY[] = "hidden_global_offset_y";
+constexpr char HiddenGlobalOffsetZ[] = "hidden_global_offset_z";
+constexpr char HiddenNone[] = "hidden_none";
+constexpr char HiddenPrintfBuffer[] = "hidden_printf_buffer";
+constexpr char HiddenDefaultQueue[] = "hidden_default_queue";
+constexpr char HiddenCompletionAction[] = "hidden_completion_action";
+} // end namespace ValueKind
+
+namespace ValueType {
+constexpr char Struct[] = "struct";
+constexpr char I8[] = "i8";
+constexpr char U8[] = "u8";
+constexpr char I16[] = "i16";
+constexpr char U16[] = "u16";
+constexpr char F16[] = "f16";
+constexpr char I32[] = "i32";
+constexpr char U32[] = "u32";
+constexpr char F32[] = "f32";
+constexpr char I64[] = "i64";
+constexpr char U64[] = "u64";
+constexpr char F64[] = "f64";
+} // end namespace ValueType
+
+} // end namespace Arg
+
+namespace Key {
+constexpr char Name[] = ".name";
+constexpr char Symbol[] = ".symbol";
+constexpr char Language[] = ".language";
+constexpr char LanguageVersion[] = ".language_version";
+constexpr char Args[] = ".args";
+constexpr char ReqdWorkGroupSize[] = ".reqd_workgroup_size";
+constexpr char WorkGroupSizeHint[] = ".workgroup_size_hint";
+constexpr char VecTypeHint[] = ".vec_type_hint";
+constexpr char DeviceEnqueueSymbol[] = ".device_enqueue_symbol";
+constexpr char KernargSegmentSize[] = ".kernarg_segment_size";
+constexpr char GroupSegmentFixedSize[] = ".group_segment_fixed_size";
+constexpr char PrivateSegmentFixedSize[] = ".private_segment_fixed_size";
+constexpr char KernargSegmentAlign[] = ".kernarg_segment_align";
+constexpr char WavefrontSize[] = ".wavefront_size";
+constexpr char SGPRCount[] = ".sgpr_count";
+constexpr char VGPRCount[] = ".vgpr_count";
+constexpr char MaxFlatWorkGroupSize[] = ".max_flat_workgroup_size";
+constexpr char SGPRSpillCount[] = ".sgpr_spill_count";
+constexpr char VGPRSpillCount[] = ".vgpr_spill_count";
+} // end namespace Key
+
+namespace Language {
+constexpr char OpenCLC[] = "OpenCL C";
+constexpr char OpenCLCXX[] = "OpenCL C++";
+constexpr char HCC[] = "HCC";
+constexpr char HIP[] = "HIP";
+constexpr char OpenMP[] = "OpenMP";
+constexpr char Assembler[] = "Assembler";
+} // end namespace Language
+
+} // end namespace Kernel
+
+namespace Key {
+constexpr char Version[] = "amdhsa.version";
+constexpr char Printf[] = "amdhsa.printf";
+constexpr char Kernels[] = "amdhsa.kernels";
+} // end namespace Key
+
+} // end namespace V3
+
 } // end namespace HSAMD
 
 //===----------------------------------------------------------------------===//
Index: lib/BinaryFormat/AMDGPUMetadataVerifier.cpp
===================================================================
--- /dev/null
+++ lib/BinaryFormat/AMDGPUMetadataVerifier.cpp
@@ -0,0 +1,325 @@
+//===- AMDGPUMetadataVerifier.cpp - MsgPack Types ---------------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// Implements a verifier for AMDGPU HSA metadata.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/BinaryFormat/AMDGPUMetadataVerifier.h"
+#include "llvm/Support/AMDGPUMetadata.h"
+
+namespace llvm {
+namespace AMDGPU {
+namespace HSAMD {
+namespace V3 {
+
+bool MetadataVerifier::verifyScalar(
+    msgpack::Node &Node, msgpack::ScalarNode::ScalarKind SKind,
+    function_ref<bool(msgpack::ScalarNode &)> verifyValue) {
+  auto ScalarPtr = dyn_cast<msgpack::ScalarNode>(&Node);
+  if (!ScalarPtr)
+    return false;
+  auto &Scalar = *ScalarPtr;
+  // Do not output extraneous tags for types we know from the spec.
+  Scalar.IgnoreTag = true;
+  if (Scalar.getScalarKind() != SKind) {
+    if (Strict)
+      return false;
+    // If we are not strict, we interpret string values as "implicitly typed"
+    // and attempt to coerce them to the expected type here.
+    if (Scalar.getScalarKind() != msgpack::ScalarNode::SK_String)
+      return false;
+    std::string StringValue = Scalar.getString();
+    Scalar.setScalarKind(SKind);
+    if (Scalar.inputYAML(StringValue) != StringRef())
+      return false;
+  }
+  if (verifyValue)
+    return verifyValue(Scalar);
+  return true;
+}
+
+bool MetadataVerifier::verifyInteger(msgpack::Node &Node) {
+  if (!verifyScalar(Node, msgpack::ScalarNode::SK_UInt))
+    if (!verifyScalar(Node, msgpack::ScalarNode::SK_Int))
+      return false;
+  return true;
+}
+
+bool MetadataVerifier::verifyArray(
+    msgpack::Node &Node, function_ref<bool(msgpack::Node &)> verifyNode,
+    Optional<size_t> Size) {
+  auto ArrayPtr = dyn_cast<msgpack::ArrayNode<>>(&Node);
+  if (!ArrayPtr)
+    return false;
+  auto &Array = *ArrayPtr;
+  if (Size && Array.size() != *Size)
+    return false;
+  for (auto &Item : Array)
+    if (!verifyNode(*Item.get()))
+      return false;
+
+  return true;
+}
+
+bool MetadataVerifier::verifyEntry(
+    msgpack::MapNode<> &MapNode, StringRef Key, bool Required,
+    function_ref<bool(msgpack::Node &)> verifyNode) {
+  auto Entry = MapNode.find(Key);
+  if (Entry == MapNode.end())
+    return !Required;
+  return verifyNode(*Entry->second.get());
+}
+
+bool MetadataVerifier::verifyScalarEntry(
+    msgpack::MapNode<> &MapNode, StringRef Key, bool Required,
+    msgpack::ScalarNode::ScalarKind SKind,
+    function_ref<bool(msgpack::ScalarNode &)> verifyValue) {
+  return verifyEntry(MapNode, Key, Required, [=](msgpack::Node &Node) {
+    return verifyScalar(Node, SKind, verifyValue);
+  });
+}
+
+bool MetadataVerifier::verifyIntegerEntry(msgpack::MapNode<> &MapNode,
+                                             StringRef Key, bool Required) {
+  return verifyEntry(MapNode, Key, Required, [this](msgpack::Node &Node) {
+    return verifyInteger(Node);
+  });
+}
+
+bool MetadataVerifier::verifyKernelArgs(msgpack::Node &Node) {
+  auto ArgsMapPtr = dyn_cast<msgpack::MapNode<>>(&Node);
+  if (!ArgsMapPtr)
+    return false;
+  auto &ArgsMap = *ArgsMapPtr;
+
+  if (!verifyScalarEntry(ArgsMap, Kernel::Arg::Key::Name, false,
+                         msgpack::ScalarNode::SK_String))
+    return false;
+  if (!verifyScalarEntry(ArgsMap, Kernel::Arg::Key::TypeName, false,
+                         msgpack::ScalarNode::SK_String))
+    return false;
+  if (!verifyIntegerEntry(ArgsMap, Kernel::Arg::Key::Size, true))
+    return false;
+  if (!verifyIntegerEntry(ArgsMap, Kernel::Arg::Key::Offset, true))
+    return false;
+  if (!verifyScalarEntry(
+          ArgsMap, Kernel::Arg::Key::ValueKind, true,
+          msgpack::ScalarNode::SK_String, [this](msgpack::ScalarNode &SNode) {
+            return StringSwitch<bool>(SNode.getString())
+                .Case(Kernel::Arg::ValueKind::ByValue, true)
+                .Case(Kernel::Arg::ValueKind::GlobalBuffer, true)
+                .Case(Kernel::Arg::ValueKind::DynamicSharedPointer, true)
+                .Case(Kernel::Arg::ValueKind::Sampler, true)
+                .Case(Kernel::Arg::ValueKind::Image, true)
+                .Case(Kernel::Arg::ValueKind::Pipe, true)
+                .Case(Kernel::Arg::ValueKind::Queue, true)
+                .Case(Kernel::Arg::ValueKind::HiddenGlobalOffsetX, true)
+                .Case(Kernel::Arg::ValueKind::HiddenGlobalOffsetY, true)
+                .Case(Kernel::Arg::ValueKind::HiddenGlobalOffsetZ, true)
+                .Case(Kernel::Arg::ValueKind::HiddenNone, true)
+                .Case(Kernel::Arg::ValueKind::HiddenPrintfBuffer, true)
+                .Case(Kernel::Arg::ValueKind::HiddenDefaultQueue, true)
+                .Case(Kernel::Arg::ValueKind::HiddenCompletionAction, true)
+                .Default(false);
+          }))
+    return false;
+  if (!verifyScalarEntry(ArgsMap, Kernel::Arg::Key::ValueType, true,
+                         msgpack::ScalarNode::SK_String,
+                         [this](msgpack::ScalarNode &SNode) {
+                           return StringSwitch<bool>(SNode.getString())
+                               .Case(Kernel::Arg::ValueType::Struct, true)
+                               .Case(Kernel::Arg::ValueType::I8, true)
+                               .Case(Kernel::Arg::ValueType::U8, true)
+                               .Case(Kernel::Arg::ValueType::I16, true)
+                               .Case(Kernel::Arg::ValueType::U16, true)
+                               .Case(Kernel::Arg::ValueType::F16, true)
+                               .Case(Kernel::Arg::ValueType::I32, true)
+                               .Case(Kernel::Arg::ValueType::U32, true)
+                               .Case(Kernel::Arg::ValueType::F32, true)
+                               .Case(Kernel::Arg::ValueType::I64, true)
+                               .Case(Kernel::Arg::ValueType::U64, true)
+                               .Case(Kernel::Arg::ValueType::F64, true)
+                               .Default(false);
+                         }))
+    return false;
+  if (!verifyIntegerEntry(ArgsMap, Kernel::Arg::Key::PointeeAlign, false))
+    return false;
+  if (!verifyScalarEntry(ArgsMap, Kernel::Arg::Key::AddressSpace, false,
+                         msgpack::ScalarNode::SK_String,
+                         [this](msgpack::ScalarNode &SNode) {
+                           return StringSwitch<bool>(SNode.getString())
+                               .Case(Kernel::Arg::AddressSpace::Private, true)
+                               .Case(Kernel::Arg::AddressSpace::Global, true)
+                               .Case(Kernel::Arg::AddressSpace::Constant, true)
+                               .Case(Kernel::Arg::AddressSpace::Local, true)
+                               .Case(Kernel::Arg::AddressSpace::Generic, true)
+                               .Case(Kernel::Arg::AddressSpace::Region, true)
+                               .Default(false);
+                         }))
+    return false;
+  if (!verifyScalarEntry(ArgsMap, Kernel::Arg::Key::Access, false,
+                         msgpack::ScalarNode::SK_String,
+                         [this](msgpack::ScalarNode &SNode) {
+                           return StringSwitch<bool>(SNode.getString())
+                               .Case(Kernel::Arg::Access::ReadOnly, true)
+                               .Case(Kernel::Arg::Access::WriteOnly, true)
+                               .Case(Kernel::Arg::Access::ReadWrite, true)
+                               .Default(false);
+                         }))
+    return false;
+  if (!verifyScalarEntry(ArgsMap, Kernel::Arg::Key::ActualAccess, false,
+                         msgpack::ScalarNode::SK_String,
+                         [this](msgpack::ScalarNode &SNode) {
+                           return StringSwitch<bool>(SNode.getString())
+                               .Case(Kernel::Arg::Access::ReadOnly, true)
+                               .Case(Kernel::Arg::Access::WriteOnly, true)
+                               .Case(Kernel::Arg::Access::ReadWrite, true)
+                               .Default(false);
+                         }))
+    return false;
+  if (!verifyScalarEntry(ArgsMap, Kernel::Arg::Key::IsConst, false,
+                         msgpack::ScalarNode::SK_Boolean))
+    return false;
+  if (!verifyScalarEntry(ArgsMap, Kernel::Arg::Key::IsRestrict, false,
+                         msgpack::ScalarNode::SK_Boolean))
+    return false;
+  if (!verifyScalarEntry(ArgsMap, Kernel::Arg::Key::IsVolatile, false,
+                         msgpack::ScalarNode::SK_Boolean))
+    return false;
+  if (!verifyScalarEntry(ArgsMap, Kernel::Arg::Key::IsPipe, false,
+                         msgpack::ScalarNode::SK_Boolean))
+    return false;
+
+  return true;
+}
+
+bool MetadataVerifier::verifyKernel(msgpack::Node &Node) {
+  auto KernelMapPtr = dyn_cast<msgpack::MapNode<>>(&Node);
+  if (!KernelMapPtr)
+    return false;
+  auto &KernelMap = *KernelMapPtr;
+
+  if (!verifyScalarEntry(KernelMap, Kernel::Key::Name, true,
+                         msgpack::ScalarNode::SK_String))
+    return false;
+  if (!verifyScalarEntry(KernelMap, Kernel::Key::Symbol, true,
+                         msgpack::ScalarNode::SK_String))
+    return false;
+  if (!verifyScalarEntry(KernelMap, Kernel::Key::Language, false,
+                         msgpack::ScalarNode::SK_String,
+                         [this](msgpack::ScalarNode &SNode) {
+                           return StringSwitch<bool>(SNode.getString())
+                               .Case(Kernel::Language::OpenCLC, true)
+                               .Case(Kernel::Language::OpenCLCXX, true)
+                               .Case(Kernel::Language::HCC, true)
+                               .Case(Kernel::Language::HIP, true)
+                               .Case(Kernel::Language::OpenMP, true)
+                               .Case(Kernel::Language::Assembler, true)
+                               .Default(false);
+                         }))
+    return false;
+  if (!verifyEntry(
+          KernelMap, Kernel::Key::LanguageVersion, false,
+          [this](msgpack::Node &Node) {
+            return verifyArray(
+                Node,
+                [this](msgpack::Node &Node) { return verifyInteger(Node); }, 2);
+          }))
+    return false;
+  if (!verifyEntry(KernelMap, Kernel::Key::Args, false,
+                   [this](msgpack::Node &Node) {
+                     return verifyArray(Node, [this](msgpack::Node &Node) {
+                       return verifyKernelArgs(Node);
+                     });
+                   }))
+    return false;
+  if (!verifyEntry(KernelMap, Kernel::Key::ReqdWorkGroupSize, false,
+                   [this](msgpack::Node &Node) {
+                     return verifyArray(Node,
+                                        [this](msgpack::Node &Node) {
+                                          return verifyInteger(Node);
+                                        },
+                                        3);
+                   }))
+    return false;
+  if (!verifyEntry(KernelMap, Kernel::Key::WorkGroupSizeHint, false,
+                   [this](msgpack::Node &Node) {
+                     return verifyArray(Node,
+                                        [this](msgpack::Node &Node) {
+                                          return verifyInteger(Node);
+                                        },
+                                        3);
+                   }))
+    return false;
+  if (!verifyScalarEntry(KernelMap, Kernel::Key::VecTypeHint, false,
+                         msgpack::ScalarNode::SK_String))
+    return false;
+  if (!verifyScalarEntry(KernelMap, Kernel::Key::DeviceEnqueueSymbol, false,
+                         msgpack::ScalarNode::SK_String))
+    return false;
+  if (!verifyIntegerEntry(KernelMap, Kernel::Key::KernargSegmentSize, true))
+    return false;
+  if (!verifyIntegerEntry(KernelMap, Kernel::Key::GroupSegmentFixedSize, true))
+    return false;
+  if (!verifyIntegerEntry(KernelMap, Kernel::Key::PrivateSegmentFixedSize,
+                          true))
+    return false;
+  if (!verifyIntegerEntry(KernelMap, Kernel::Key::KernargSegmentAlign, true))
+    return false;
+  if (!verifyIntegerEntry(KernelMap, Kernel::Key::WavefrontSize, true))
+    return false;
+  if (!verifyIntegerEntry(KernelMap, Kernel::Key::SGPRCount, true))
+    return false;
+  if (!verifyIntegerEntry(KernelMap, Kernel::Key::VGPRCount, true))
+    return false;
+  if (!verifyIntegerEntry(KernelMap, Kernel::Key::MaxFlatWorkGroupSize, true))
+    return false;
+  if (!verifyIntegerEntry(KernelMap, Kernel::Key::SGPRSpillCount, false))
+    return false;
+  if (!verifyIntegerEntry(KernelMap, Kernel::Key::VGPRSpillCount, false))
+    return false;
+
+  return true;
+}
+
+bool MetadataVerifier::verify(
+    std::unique_ptr<msgpack::Node> &HSAMetadataRoot) {
+  auto RootMapPtr = dyn_cast<msgpack::MapNode<>>(HSAMetadataRoot.get());
+  if (!RootMapPtr)
+    return false;
+  auto &RootMap = *RootMapPtr;
+
+  if (!verifyEntry(
+          RootMap, Key::Version, true, [this](msgpack::Node &Node) {
+            return verifyArray(
+                Node,
+                [this](msgpack::Node &Node) { return verifyInteger(Node); }, 2);
+          }))
+    return false;
+  if (!verifyEntry(RootMap, Key::Printf, false, [this](msgpack::Node &Node) {
+        return verifyArray(Node, [this](msgpack::Node &Node) {
+          return verifyScalar(Node, msgpack::ScalarNode::SK_String);
+        });
+      }))
+    return false;
+  if (!verifyEntry(RootMap, Key::Kernels, true, [this](msgpack::Node &Node) {
+        return verifyArray(
+            Node, [this](msgpack::Node &Node) { return verifyKernel(Node); });
+      }))
+    return false;
+
+  return true;
+}
+
+} // end namespace V3
+} // end namespace HSAMD
+} // end namespace AMDGPU
+} // end namespace llvm
Index: lib/BinaryFormat/CMakeLists.txt
===================================================================
--- lib/BinaryFormat/CMakeLists.txt
+++ lib/BinaryFormat/CMakeLists.txt
@@ -1,4 +1,5 @@
 add_llvm_library(LLVMBinaryFormat
+  AMDGPUMetadataVerifier.cpp
   Dwarf.cpp
   Magic.cpp
   MsgPackReader.cpp
Index: lib/Target/AMDGPU/AMDGPUAsmPrinter.h
===================================================================
--- lib/Target/AMDGPU/AMDGPUAsmPrinter.h
+++ lib/Target/AMDGPU/AMDGPUAsmPrinter.h
@@ -56,7 +56,7 @@
   SIProgramInfo CurrentProgramInfo;
   DenseMap<const Function *, SIFunctionResourceInfo> CallGraphResourceInfo;
 
-  AMDGPU::HSAMD::MetadataStreamer HSAMetadataStream;
+  AMDGPU::HSAMD::MetadataStreamer *HSAMetadataStream;
   std::map<uint32_t, uint32_t> PALMetadataMap;
 
   uint64_t getFunctionCodeSize(const MachineFunction &MF) const;
@@ -92,6 +92,7 @@
 public:
   explicit AMDGPUAsmPrinter(TargetMachine &TM,
                             std::unique_ptr<MCStreamer> Streamer);
+  ~AMDGPUAsmPrinter();
 
   StringRef getPassName() const override;
 
Index: lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
===================================================================
--- lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
+++ lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
@@ -45,6 +45,7 @@
 
 using namespace llvm;
 using namespace llvm::AMDGPU;
+using namespace llvm::AMDGPU::HSAMD;
 
 // TODO: This should get the default rounding mode from the kernel. We just set
 // the default here, but this could change if the OpenCL rounding mode pragmas
@@ -99,8 +100,16 @@
                                    std::unique_ptr<MCStreamer> Streamer)
   : AsmPrinter(TM, std::move(Streamer)) {
     AMDGPUASI = static_cast<AMDGPUTargetMachine*>(&TM)->getAMDGPUAS();
+    if (IsaInfo::hasCodeObjectV3(getSTI()))
+      HSAMetadataStream = new MetadataStreamerV3();
+    else
+      HSAMetadataStream = new MetadataStreamerV2();
   }
 
+AMDGPUAsmPrinter::~AMDGPUAsmPrinter() {
+  delete HSAMetadataStream;
+}
+
 StringRef AMDGPUAsmPrinter::getPassName() const {
   return "AMDGPU Assembly Printer";
 }
@@ -116,16 +125,12 @@
 }
 
 void AMDGPUAsmPrinter::EmitStartOfAsmFile(Module &M) {
-  if (IsaInfo::hasCodeObjectV3(getSTI()) &&
-      TM.getTargetTriple().getOS() == Triple::AMDHSA)
-    return;
-
   if (TM.getTargetTriple().getOS() != Triple::AMDHSA &&
       TM.getTargetTriple().getOS() != Triple::AMDPAL)
     return;
 
   if (TM.getTargetTriple().getOS() == Triple::AMDHSA)
-    HSAMetadataStream.begin(M);
+    HSAMetadataStream->begin(M);
 
   if (TM.getTargetTriple().getOS() == Triple::AMDPAL)
     readPALMetadata(M);
@@ -141,37 +146,38 @@
 }
 
 void AMDGPUAsmPrinter::EmitEndOfAsmFile(Module &M) {
-  // TODO: Add metadata to code object v3.
-  if (IsaInfo::hasCodeObjectV3(getSTI()) &&
-      TM.getTargetTriple().getOS() == Triple::AMDHSA)
-    return;
-
   // Following code requires TargetStreamer to be present.
   if (!getTargetStreamer())
     return;
 
-  // Emit ISA Version (NT_AMD_AMDGPU_ISA).
-  std::string ISAVersionString;
-  raw_string_ostream ISAVersionStream(ISAVersionString);
-  IsaInfo::streamIsaVersion(getSTI(), ISAVersionStream);
-  getTargetStreamer()->EmitISAVersion(ISAVersionStream.str());
+  if (!IsaInfo::hasCodeObjectV3(getSTI())) {
+    // Emit ISA Version (NT_AMD_AMDGPU_ISA).
+    std::string ISAVersionString;
+    raw_string_ostream ISAVersionStream(ISAVersionString);
+    IsaInfo::streamIsaVersion(getSTI(), ISAVersionStream);
+    getTargetStreamer()->EmitISAVersion(ISAVersionStream.str());
+  }
 
   // Emit HSA Metadata (NT_AMD_AMDGPU_HSA_METADATA).
   if (TM.getTargetTriple().getOS() == Triple::AMDHSA) {
-    HSAMetadataStream.end();
-    getTargetStreamer()->EmitHSAMetadata(HSAMetadataStream.getHSAMetadata());
+    HSAMetadataStream->end();
+    bool Success = HSAMetadataStream->emitTo(*getTargetStreamer());
+    (void)Success;
+    assert(Success && "Malformed HSA Metadata");
   }
 
-  // Emit PAL Metadata (NT_AMD_AMDGPU_PAL_METADATA).
-  if (TM.getTargetTriple().getOS() == Triple::AMDPAL) {
-    // Copy the PAL metadata from the map where we collected it into a vector,
-    // then write it as a .note.
-    PALMD::Metadata PALMetadataVector;
-    for (auto i : PALMetadataMap) {
-      PALMetadataVector.push_back(i.first);
-      PALMetadataVector.push_back(i.second);
+  if (!IsaInfo::hasCodeObjectV3(getSTI())) {
+    // Emit PAL Metadata (NT_AMD_AMDGPU_PAL_METADATA).
+    if (TM.getTargetTriple().getOS() == Triple::AMDPAL) {
+      // Copy the PAL metadata from the map where we collected it into a vector,
+      // then write it as a .note.
+      PALMD::Metadata PALMetadataVector;
+      for (auto i : PALMetadataMap) {
+        PALMetadataVector.push_back(i.first);
+        PALMetadataVector.push_back(i.second);
+      }
+      getTargetStreamer()->EmitPALMetadata(PALMetadataVector);
     }
-    getTargetStreamer()->EmitPALMetadata(PALMetadataVector);
   }
 }
 
@@ -193,21 +199,20 @@
   const SIMachineFunctionInfo &MFI = *MF->getInfo<SIMachineFunctionInfo>();
   if (!MFI.isEntryFunction())
     return;
-  if (IsaInfo::hasCodeObjectV3(getSTI()) &&
-      TM.getTargetTriple().getOS() == Triple::AMDHSA)
-    return;
 
-  const AMDGPUSubtarget &STM = MF->getSubtarget<AMDGPUSubtarget>();
-  amd_kernel_code_t KernelCode;
-  if (STM.isAmdCodeObjectV2(MF->getFunction())) {
-    getAmdKernelCode(KernelCode, CurrentProgramInfo, *MF);
-    getTargetStreamer()->EmitAMDKernelCodeT(KernelCode);
+  if (!IsaInfo::hasCodeObjectV3(getSTI())) {
+    const AMDGPUSubtarget &STM = MF->getSubtarget<AMDGPUSubtarget>();
+    amd_kernel_code_t KernelCode;
+    if (STM.isAmdCodeObjectV2(MF->getFunction())) {
+      getAmdKernelCode(KernelCode, CurrentProgramInfo, *MF);
+      getTargetStreamer()->EmitAMDKernelCodeT(KernelCode);
+    }
   }
 
   if (TM.getTargetTriple().getOS() != Triple::AMDHSA)
     return;
 
-  HSAMetadataStream.emitKernel(*MF, CurrentProgramInfo);
+  HSAMetadataStream->emitKernel(*MF, CurrentProgramInfo);
 }
 
 void AMDGPUAsmPrinter::EmitFunctionBodyEnd() {
Index: lib/Target/AMDGPU/AMDGPUPTNote.h
===================================================================
--- lib/Target/AMDGPU/AMDGPUPTNote.h
+++ lib/Target/AMDGPU/AMDGPUPTNote.h
@@ -23,7 +23,8 @@
 
 const char SectionName[] = ".note";
 
-const char NoteName[] = "AMD";
+const char NoteNameV2[] = "AMD";
+const char NoteNameV3[] = "AMDGPU";
 
 // TODO: Remove this file once we drop code object v2.
 enum NoteType{
Index: lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
===================================================================
--- lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
+++ lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
@@ -2733,8 +2733,13 @@
 
   YamlStream.flush();
 
-  if (!getTargetStreamer().EmitHSAMetadata(HSAMetadataString))
-    return Error(getParser().getTok().getLoc(), "invalid HSA metadata");
+  if (IsaInfo::hasCodeObjectV3(&getSTI())) {
+    if (!getTargetStreamer().EmitHSAMetadataV3(HSAMetadataString))
+      return Error(getParser().getTok().getLoc(), "invalid HSA metadata");
+  } else {
+    if (!getTargetStreamer().EmitHSAMetadataV2(HSAMetadataString))
+      return Error(getParser().getTok().getLoc(), "invalid HSA metadata");
+  }
 
   return false;
 }
Index: lib/Target/AMDGPU/MCTargetDesc/AMDGPUHSAMetadataStreamer.h
===================================================================
--- lib/Target/AMDGPU/MCTargetDesc/AMDGPUHSAMetadataStreamer.h
+++ lib/Target/AMDGPU/MCTargetDesc/AMDGPUHSAMetadataStreamer.h
@@ -17,8 +17,10 @@
 #define LLVM_LIB_TARGET_AMDGPU_MCTARGETDESC_AMDGPUHSAMETADATASTREAMER_H
 
 #include "AMDGPU.h"
+#include "AMDGPUTargetStreamer.h"
 #include "AMDKernelCodeT.h"
 #include "llvm/ADT/StringRef.h"
+#include "llvm/BinaryFormat/MsgPackTypes.h"
 #include "llvm/Support/AMDGPUMetadata.h"
 
 namespace llvm {
@@ -34,7 +36,86 @@
 namespace AMDGPU {
 namespace HSAMD {
 
-class MetadataStreamer final {
+class MetadataStreamer {
+public:
+  virtual ~MetadataStreamer() {};
+
+  virtual bool emitTo(AMDGPUTargetStreamer &TargetStreamer) = 0;
+
+  virtual void begin(const Module &Mod) = 0;
+
+  virtual void end() = 0;
+
+  virtual void emitKernel(const MachineFunction &MF, const SIProgramInfo &ProgramInfo) = 0;
+};
+
+class MetadataStreamerV3 final : public MetadataStreamer {
+private:
+  std::unique_ptr<msgpack::Node> HSAMetadataRoot = msgpack::makePtr<msgpack::MapNode<>>();
+  AMDGPUAS AMDGPUASI;
+
+  void dump(StringRef HSAMetadataString) const;
+
+  void verify(StringRef HSAMetadataString) const;
+
+  Optional<StringRef> getAccessQualifier(StringRef AccQual) const;
+
+  StringRef getAddressSpaceQualifer(unsigned AddressSpace) const;
+
+  StringRef getValueKind(Type *Ty, StringRef TypeQual,
+                         StringRef BaseTypeName) const;
+
+  StringRef getValueType(Type *Ty, StringRef TypeName) const;
+
+  std::string getTypeName(Type *Ty, bool Signed) const;
+
+  std::unique_ptr<msgpack::ArrayNode<>>
+  getWorkGroupDimensions(MDNode *Node) const;
+
+  std::unique_ptr<msgpack::MapNode<>>
+  getHSAKernelProps(const MachineFunction &MF, const SIProgramInfo &ProgramInfo) const;
+
+  void emitVersion();
+
+  void emitPrintf(const Module &Mod);
+
+  void emitKernelLanguage(const Function &Func, msgpack::MapNode<> &Kern);
+
+  void emitKernelAttrs(const Function &Func, msgpack::MapNode<> &Kern);
+
+  void emitKernelArgs(const Function &Func, msgpack::MapNode<> &Kern);
+
+  void emitKernelArg(const Argument &Arg, unsigned &Offset,
+                     msgpack::ArrayNode<> &Args);
+
+  void emitKernelArg(const DataLayout &DL, Type *Ty, StringRef ValueKind,
+                     unsigned &Offset, msgpack::ArrayNode<> &Args,
+                     StringRef Name = "", StringRef TypeName = "",
+                     StringRef BaseTypeName = "", StringRef AccQual = "",
+                     StringRef TypeQual = "");
+
+  std::unique_ptr<msgpack::Node> &getRootMetadata(StringRef Key) {
+    return (*cast<msgpack::MapNode<>>(HSAMetadataRoot.get()))[Key];
+  }
+
+  std::unique_ptr<msgpack::Node> &getHSAMetadataRoot() {
+    return HSAMetadataRoot;
+  }
+
+public:
+  MetadataStreamerV3() = default;
+  ~MetadataStreamerV3() = default;
+
+  bool emitTo(AMDGPUTargetStreamer &TargetStreamer) override;
+
+  void begin(const Module &Mod) override;
+
+  void end() override;
+
+  void emitKernel(const MachineFunction &MF, const SIProgramInfo &ProgramInfo) override;
+};
+
+class MetadataStreamerV2 final : public MetadataStreamer {
 private:
   Metadata HSAMetadata;
   AMDGPUAS AMDGPUASI;
@@ -80,19 +161,21 @@
                      StringRef BaseTypeName = "", StringRef AccQual = "",
                      StringRef TypeQual = "");
 
-public:
-  MetadataStreamer() = default;
-  ~MetadataStreamer() = default;
-
   const Metadata &getHSAMetadata() const {
     return HSAMetadata;
   }
 
-  void begin(const Module &Mod);
+public:
+  MetadataStreamerV2() = default;
+  ~MetadataStreamerV2() = default;
+
+  bool emitTo(AMDGPUTargetStreamer &TargetStreamer) override;
+
+  void begin(const Module &Mod) override;
 
-  void end();
+  void end() override;
 
-  void emitKernel(const MachineFunction &MF, const SIProgramInfo &ProgramInfo);
+  void emitKernel(const MachineFunction &MF, const SIProgramInfo &ProgramInfo) override;
 };
 
 } // end namespace HSAMD
Index: lib/Target/AMDGPU/MCTargetDesc/AMDGPUHSAMetadataStreamer.cpp
===================================================================
--- lib/Target/AMDGPU/MCTargetDesc/AMDGPUHSAMetadataStreamer.cpp
+++ lib/Target/AMDGPU/MCTargetDesc/AMDGPUHSAMetadataStreamer.cpp
@@ -35,11 +35,14 @@
 namespace AMDGPU {
 namespace HSAMD {
 
-void MetadataStreamer::dump(StringRef HSAMetadataString) const {
+//===----------------------------------------------------------------------===//
+// HSAMetadataStreamerV2
+//===----------------------------------------------------------------------===//
+void MetadataStreamerV2::dump(StringRef HSAMetadataString) const {
   errs() << "AMDGPU HSA Metadata:\n" << HSAMetadataString << '\n';
 }
 
-void MetadataStreamer::verify(StringRef HSAMetadataString) const {
+void MetadataStreamerV2::verify(StringRef HSAMetadataString) const {
   errs() << "AMDGPU HSA Metadata Parser Test: ";
 
   HSAMD::Metadata FromHSAMetadataString;
@@ -62,7 +65,7 @@
   }
 }
 
-AccessQualifier MetadataStreamer::getAccessQualifier(StringRef AccQual) const {
+AccessQualifier MetadataStreamerV2::getAccessQualifier(StringRef AccQual) const {
   if (AccQual.empty())
     return AccessQualifier::Unknown;
 
@@ -73,7 +76,7 @@
              .Default(AccessQualifier::Default);
 }
 
-AddressSpaceQualifier MetadataStreamer::getAddressSpaceQualifer(
+AddressSpaceQualifier MetadataStreamerV2::getAddressSpaceQualifer(
     unsigned AddressSpace) const {
   if (AddressSpace == AMDGPUASI.PRIVATE_ADDRESS)
     return AddressSpaceQualifier::Private;
@@ -91,7 +94,7 @@
   llvm_unreachable("Unknown address space qualifier");
 }
 
-ValueKind MetadataStreamer::getValueKind(Type *Ty, StringRef TypeQual,
+ValueKind MetadataStreamerV2::getValueKind(Type *Ty, StringRef TypeQual,
                                          StringRef BaseTypeName) const {
   if (TypeQual.find("pipe") != StringRef::npos)
     return ValueKind::Pipe;
@@ -119,7 +122,7 @@
                       ValueKind::ByValue);
 }
 
-ValueType MetadataStreamer::getValueType(Type *Ty, StringRef TypeName) const {
+ValueType MetadataStreamerV2::getValueType(Type *Ty, StringRef TypeName) const {
   switch (Ty->getTypeID()) {
   case Type::IntegerTyID: {
     auto Signed = !TypeName.startswith("u");
@@ -151,7 +154,7 @@
   }
 }
 
-std::string MetadataStreamer::getTypeName(Type *Ty, bool Signed) const {
+std::string MetadataStreamerV2::getTypeName(Type *Ty, bool Signed) const {
   switch (Ty->getTypeID()) {
   case Type::IntegerTyID: {
     if (!Signed)
@@ -188,7 +191,7 @@
   }
 }
 
-std::vector<uint32_t> MetadataStreamer::getWorkGroupDimensions(
+std::vector<uint32_t> MetadataStreamerV2::getWorkGroupDimensions(
     MDNode *Node) const {
   std::vector<uint32_t> Dims;
   if (Node->getNumOperands() != 3)
@@ -199,7 +202,7 @@
   return Dims;
 }
 
-Kernel::CodeProps::Metadata MetadataStreamer::getHSACodeProps(
+Kernel::CodeProps::Metadata MetadataStreamerV2::getHSACodeProps(
     const MachineFunction &MF,
     const SIProgramInfo &ProgramInfo) const {
   const SISubtarget &STM = MF.getSubtarget<SISubtarget>();
@@ -224,7 +227,7 @@
   return HSACodeProps;
 }
 
-Kernel::DebugProps::Metadata MetadataStreamer::getHSADebugProps(
+Kernel::DebugProps::Metadata MetadataStreamerV2::getHSADebugProps(
     const MachineFunction &MF,
     const SIProgramInfo &ProgramInfo) const {
   const SISubtarget &STM = MF.getSubtarget<SISubtarget>();
@@ -249,14 +252,14 @@
 }
 
 
-void MetadataStreamer::emitVersion() {
+void MetadataStreamerV2::emitVersion() {
   auto &Version = HSAMetadata.mVersion;
 
   Version.push_back(VersionMajor);
   Version.push_back(VersionMinor);
 }
 
-void MetadataStreamer::emitPrintf(const Module &Mod) {
+void MetadataStreamerV2::emitPrintf(const Module &Mod) {
   auto &Printf = HSAMetadata.mPrintf;
 
   auto Node = Mod.getNamedMetadata("llvm.printf.fmts");
@@ -268,7 +271,7 @@
       Printf.push_back(cast<MDString>(Op->getOperand(0))->getString());
 }
 
-void MetadataStreamer::emitKernelLanguage(const Function &Func) {
+void MetadataStreamerV2::emitKernelLanguage(const Function &Func) {
   auto &Kernel = HSAMetadata.mKernels.back();
 
   // TODO: What about other languages?
@@ -286,7 +289,7 @@
       mdconst::extract<ConstantInt>(Op0->getOperand(1))->getZExtValue());
 }
 
-void MetadataStreamer::emitKernelAttrs(const Function &Func) {
+void MetadataStreamerV2::emitKernelAttrs(const Function &Func) {
   auto &Attrs = HSAMetadata.mKernels.back().mAttrs;
 
   if (auto Node = Func.getMetadata("reqd_work_group_size"))
@@ -304,7 +307,7 @@
   }
 }
 
-void MetadataStreamer::emitKernelArgs(const Function &Func) {
+void MetadataStreamerV2::emitKernelArgs(const Function &Func) {
   for (auto &Arg : Func.args())
     emitKernelArg(Arg);
 
@@ -340,7 +343,7 @@
   }
 }
 
-void MetadataStreamer::emitKernelArg(const Argument &Arg) {
+void MetadataStreamerV2::emitKernelArg(const Argument &Arg) {
   auto Func = Arg.getParent();
   auto ArgNo = Arg.getArgNo();
   const MDNode *Node;
@@ -382,7 +385,7 @@
                 TypeName, BaseTypeName, AccQual, TypeQual);
 }
 
-void MetadataStreamer::emitKernelArg(const DataLayout &DL, Type *Ty,
+void MetadataStreamerV2::emitKernelArg(const DataLayout &DL, Type *Ty,
                                      ValueKind ValueKind, StringRef Name,
                                      StringRef TypeName, StringRef BaseTypeName,
                                      StringRef AccQual, StringRef TypeQual) {
@@ -423,13 +426,17 @@
   }
 }
 
-void MetadataStreamer::begin(const Module &Mod) {
+bool MetadataStreamerV2::emitTo(AMDGPUTargetStreamer &TargetStreamer) {
+  return TargetStreamer.EmitHSAMetadata(getHSAMetadata());
+}
+
+void MetadataStreamerV2::begin(const Module &Mod) {
   AMDGPUASI = getAMDGPUAS(Mod);
   emitVersion();
   emitPrintf(Mod);
 }
 
-void MetadataStreamer::end() {
+void MetadataStreamerV2::end() {
   std::string HSAMetadataString;
   if (toString(HSAMetadata, HSAMetadataString))
     return;
@@ -440,7 +447,7 @@
     verify(HSAMetadataString);
 }
 
-void MetadataStreamer::emitKernel(const MachineFunction &MF, const SIProgramInfo &ProgramInfo) {
+void MetadataStreamerV2::emitKernel(const MachineFunction &MF, const SIProgramInfo &ProgramInfo) {
   auto &Func = MF.getFunction();
   auto CodeProps = getHSACodeProps(MF, ProgramInfo);
   auto DebugProps = getHSADebugProps(MF, ProgramInfo);
@@ -460,6 +467,461 @@
   HSAMetadata.mKernels.back().mDebugProps = DebugProps;
 }
 
+//===----------------------------------------------------------------------===//
+// HSAMetadataStreamerV3
+//===----------------------------------------------------------------------===//
+
+void MetadataStreamerV3::dump(StringRef HSAMetadataString) const {
+  errs() << "AMDGPU HSA Metadata:\n" << HSAMetadataString << '\n';
+}
+
+void MetadataStreamerV3::verify(StringRef HSAMetadataString) const {
+  errs() << "AMDGPU HSA Metadata Parser Test: ";
+
+  std::unique_ptr<msgpack::Node> FromHSAMetadataString
+    = make_unique<msgpack::MapNode<>>();
+
+  yaml::Input YIn(HSAMetadataString);
+  YIn >> FromHSAMetadataString;
+  if (YIn.error()) {
+    errs() << "FAIL\n";
+    return;
+  }
+
+  std::string ToHSAMetadataString;
+  raw_string_ostream StrOS(ToHSAMetadataString);
+  yaml::Output YOut(StrOS);
+  YOut << FromHSAMetadataString;
+
+  errs() << (HSAMetadataString == StrOS.str() ? "PASS" : "FAIL")
+         << '\n';
+  if (HSAMetadataString != ToHSAMetadataString) {
+    errs() << "Original input: " << HSAMetadataString << '\n'
+           << "Produced output: " << StrOS.str() << '\n';
+  }
+}
+
+Optional<StringRef> MetadataStreamerV3::getAccessQualifier(StringRef AccQual) const {
+  return StringSwitch<Optional<StringRef>>(AccQual)
+      .Case("read_only", StringRef(V3::Kernel::Arg::Access::ReadOnly))
+      .Case("write_only", StringRef(V3::Kernel::Arg::Access::WriteOnly))
+      .Case("read_write", StringRef(V3::Kernel::Arg::Access::ReadWrite))
+      .Default(None);
+}
+
+StringRef
+MetadataStreamerV3::getAddressSpaceQualifer(unsigned AddressSpace) const {
+  if (AddressSpace == AMDGPUASI.PRIVATE_ADDRESS)
+    return V3::Kernel::Arg::AddressSpace::Private;
+  if (AddressSpace == AMDGPUASI.GLOBAL_ADDRESS)
+    return V3::Kernel::Arg::AddressSpace::Global;
+  if (AddressSpace == AMDGPUASI.CONSTANT_ADDRESS)
+    return V3::Kernel::Arg::AddressSpace::Constant;
+  if (AddressSpace == AMDGPUASI.LOCAL_ADDRESS)
+    return V3::Kernel::Arg::AddressSpace::Local;
+  if (AddressSpace == AMDGPUASI.FLAT_ADDRESS)
+    return V3::Kernel::Arg::AddressSpace::Generic;
+  if (AddressSpace == AMDGPUASI.REGION_ADDRESS)
+    return V3::Kernel::Arg::AddressSpace::Region;
+
+  llvm_unreachable("Unknown address space qualifier");
+}
+
+StringRef MetadataStreamerV3::getValueKind(Type *Ty, StringRef TypeQual,
+                                         StringRef BaseTypeName) const {
+  if (TypeQual.find("pipe") != StringRef::npos)
+    return V3::Kernel::Arg::ValueKind::Pipe;
+
+  return StringSwitch<StringRef>(BaseTypeName)
+      .Case("image1d_t", V3::Kernel::Arg::ValueKind::Image)
+      .Case("image1d_array_t", V3::Kernel::Arg::ValueKind::Image)
+      .Case("image1d_buffer_t", V3::Kernel::Arg::ValueKind::Image)
+      .Case("image2d_t", V3::Kernel::Arg::ValueKind::Image)
+      .Case("image2d_array_t", V3::Kernel::Arg::ValueKind::Image)
+      .Case("image2d_array_depth_t", V3::Kernel::Arg::ValueKind::Image)
+      .Case("image2d_array_msaa_t", V3::Kernel::Arg::ValueKind::Image)
+      .Case("image2d_array_msaa_depth_t", V3::Kernel::Arg::ValueKind::Image)
+      .Case("image2d_depth_t", V3::Kernel::Arg::ValueKind::Image)
+      .Case("image2d_msaa_t", V3::Kernel::Arg::ValueKind::Image)
+      .Case("image2d_msaa_depth_t", V3::Kernel::Arg::ValueKind::Image)
+      .Case("image3d_t", V3::Kernel::Arg::ValueKind::Image)
+      .Case("sampler_t", V3::Kernel::Arg::ValueKind::Sampler)
+      .Case("queue_t", V3::Kernel::Arg::ValueKind::Queue)
+      .Default(isa<PointerType>(Ty)
+                   ? (Ty->getPointerAddressSpace() == AMDGPUASI.LOCAL_ADDRESS
+                          ? V3::Kernel::Arg::ValueKind::DynamicSharedPointer
+                          : V3::Kernel::Arg::ValueKind::GlobalBuffer)
+                   : V3::Kernel::Arg::ValueKind::ByValue);
+}
+
+StringRef MetadataStreamerV3::getValueType(Type *Ty, StringRef TypeName) const {
+  switch (Ty->getTypeID()) {
+  case Type::IntegerTyID: {
+    auto Signed = !TypeName.startswith("u");
+    switch (Ty->getIntegerBitWidth()) {
+    case 8:
+      return Signed ? V3::Kernel::Arg::ValueType::I8 : V3::Kernel::Arg::ValueType::U8;
+    case 16:
+      return Signed ? V3::Kernel::Arg::ValueType::I16 : V3::Kernel::Arg::ValueType::U16;
+    case 32:
+      return Signed ? V3::Kernel::Arg::ValueType::I32 : V3::Kernel::Arg::ValueType::U32;
+    case 64:
+      return Signed ? V3::Kernel::Arg::ValueType::I64 : V3::Kernel::Arg::ValueType::U64;
+    default:
+      return V3::Kernel::Arg::ValueType::Struct;
+    }
+  }
+  case Type::HalfTyID:
+    return V3::Kernel::Arg::ValueType::F16;
+  case Type::FloatTyID:
+    return V3::Kernel::Arg::ValueType::F32;
+  case Type::DoubleTyID:
+    return V3::Kernel::Arg::ValueType::F64;
+  case Type::PointerTyID:
+    return getValueType(Ty->getPointerElementType(), TypeName);
+  case Type::VectorTyID:
+    return getValueType(Ty->getVectorElementType(), TypeName);
+  default:
+    return V3::Kernel::Arg::ValueType::Struct;
+  }
+}
+
+std::string MetadataStreamerV3::getTypeName(Type *Ty, bool Signed) const {
+  switch (Ty->getTypeID()) {
+  case Type::IntegerTyID: {
+    if (!Signed)
+      return (Twine('u') + getTypeName(Ty, true)).str();
+
+    auto BitWidth = Ty->getIntegerBitWidth();
+    switch (BitWidth) {
+    case 8:
+      return "char";
+    case 16:
+      return "short";
+    case 32:
+      return "int";
+    case 64:
+      return "long";
+    default:
+      return (Twine('i') + Twine(BitWidth)).str();
+    }
+  }
+  case Type::HalfTyID:
+    return "half";
+  case Type::FloatTyID:
+    return "float";
+  case Type::DoubleTyID:
+    return "double";
+  case Type::VectorTyID: {
+    auto VecTy = cast<VectorType>(Ty);
+    auto ElTy = VecTy->getElementType();
+    auto NumElements = VecTy->getVectorNumElements();
+    return (Twine(getTypeName(ElTy, Signed)) + Twine(NumElements)).str();
+  }
+  default:
+    return "unknown";
+  }
+}
+
+std::unique_ptr<msgpack::ArrayNode<>>
+MetadataStreamerV3::getWorkGroupDimensions(MDNode *Node) const {
+  auto Dims = make_unique<msgpack::ArrayNode<>>();
+  if (Node->getNumOperands() != 3)
+    return Dims;
+
+  for (auto &Op : Node->operands())
+    Dims->push_back(make_unique<msgpack::ScalarNode>(
+        mdconst::extract<ConstantInt>(Op)->getZExtValue()));
+  return Dims;
+}
+
+void MetadataStreamerV3::emitVersion() {
+  auto Version = make_unique<msgpack::ArrayNode<>>();
+  Version->push_back(make_unique<msgpack::ScalarNode>(V3::VersionMajor));
+  Version->push_back(make_unique<msgpack::ScalarNode>(V3::VersionMinor));
+  getRootMetadata(V3::Key::Version) = std::move(Version);
+}
+
+void MetadataStreamerV3::emitPrintf(const Module &Mod) {
+  auto Node = Mod.getNamedMetadata("llvm.printf.fmts");
+  if (!Node)
+    return;
+
+  auto Printf = make_unique<msgpack::ArrayNode<>>();
+  for (auto Op : Node->operands())
+    if (Op->getNumOperands())
+      Printf->push_back(make_unique<msgpack::ScalarNode>(cast<MDString>(Op->getOperand(0))->getString()));
+  getRootMetadata(V3::Key::Printf) = std::move(Printf);
+}
+
+void MetadataStreamerV3::emitKernelLanguage(const Function &Func,
+                                          msgpack::MapNode<> &Kern) {
+  // TODO: What about other languages?
+  auto Node = Func.getParent()->getNamedMetadata("opencl.ocl.version");
+  if (!Node || !Node->getNumOperands())
+    return;
+  auto Op0 = Node->getOperand(0);
+  if (Op0->getNumOperands() <= 1)
+    return;
+
+  Kern[V3::Kernel::Key::Language] =
+      make_unique<msgpack::ScalarNode>(V3::Kernel::Language::OpenCLC);
+  auto LanguageVersion = make_unique<msgpack::ArrayNode<>>();
+  LanguageVersion->push_back(make_unique<msgpack::ScalarNode>(
+      mdconst::extract<ConstantInt>(Op0->getOperand(0))->getZExtValue()));
+  LanguageVersion->push_back(make_unique<msgpack::ScalarNode>(
+      mdconst::extract<ConstantInt>(Op0->getOperand(1))->getZExtValue()));
+  Kern[V3::Kernel::Key::LanguageVersion] = std::move(LanguageVersion);
+}
+
+void MetadataStreamerV3::emitKernelAttrs(const Function &Func,
+                                       msgpack::MapNode<> &Kern) {
+
+  if (auto Node = Func.getMetadata("reqd_work_group_size"))
+    Kern[V3::Kernel::Key::ReqdWorkGroupSize] = getWorkGroupDimensions(Node);
+  if (auto Node = Func.getMetadata("work_group_size_hint"))
+    Kern[V3::Kernel::Key::WorkGroupSizeHint] = getWorkGroupDimensions(Node);
+  if (auto Node = Func.getMetadata("vec_type_hint")) {
+    Kern[V3::Kernel::Key::VecTypeHint] = make_unique<msgpack::ScalarNode>(
+        getTypeName(cast<ValueAsMetadata>(Node->getOperand(0))->getType(),
+                    mdconst::extract<ConstantInt>(Node->getOperand(1))
+                        ->getZExtValue()));
+  }
+  if (Func.hasFnAttribute("runtime-handle")) {
+    Kern[V3::Kernel::Key::DeviceEnqueueSymbol] = make_unique<msgpack::ScalarNode>(
+        Func.getFnAttribute("runtime-handle").getValueAsString().str());
+  }
+}
+
+void MetadataStreamerV3::emitKernelArgs(const Function &Func,
+                                      msgpack::MapNode<> &Kern) {
+  unsigned Offset = 0;
+  auto Args = make_unique<msgpack::ArrayNode<>>();
+  for (auto &Arg : Func.args())
+    emitKernelArg(Arg, Offset, *Args);
+
+  // TODO: What about other languages?
+  if (Func.getParent()->getNamedMetadata("opencl.ocl.version")) {
+    auto &DL = Func.getParent()->getDataLayout();
+    auto Int64Ty = Type::getInt64Ty(Func.getContext());
+
+    emitKernelArg(DL, Int64Ty, V3::Kernel::Arg::ValueKind::HiddenGlobalOffsetX,
+                  Offset, *Args);
+    emitKernelArg(DL, Int64Ty, V3::Kernel::Arg::ValueKind::HiddenGlobalOffsetY,
+                  Offset, *Args);
+    emitKernelArg(DL, Int64Ty, V3::Kernel::Arg::ValueKind::HiddenGlobalOffsetZ,
+                  Offset, *Args);
+
+    auto Int8PtrTy = Type::getInt8PtrTy(Func.getContext(),
+                                        AMDGPUASI.GLOBAL_ADDRESS);
+
+    // Emit "printf buffer" argument if printf is used, otherwise emit dummy
+    // "none" argument.
+    if (Func.getParent()->getNamedMetadata("llvm.printf.fmts"))
+      emitKernelArg(DL, Int8PtrTy, V3::Kernel::Arg::ValueKind::HiddenPrintfBuffer,
+                    Offset, *Args);
+    else
+      emitKernelArg(DL, Int8PtrTy, V3::Kernel::Arg::ValueKind::HiddenNone, Offset,
+                    *Args);
+
+    // Emit "default queue" and "completion action" arguments if enqueue kernel is
+    // used, otherwise emit dummy "none" arguments.
+    if (Func.hasFnAttribute("calls-enqueue-kernel")) {
+      emitKernelArg(DL, Int8PtrTy, V3::Kernel::Arg::ValueKind::HiddenDefaultQueue,
+                    Offset, *Args);
+      emitKernelArg(DL, Int8PtrTy, V3::Kernel::Arg::ValueKind::HiddenCompletionAction,
+                    Offset, *Args);
+    } else {
+      emitKernelArg(DL, Int8PtrTy, V3::Kernel::Arg::ValueKind::HiddenNone, Offset,
+                    *Args);
+      emitKernelArg(DL, Int8PtrTy, V3::Kernel::Arg::ValueKind::HiddenNone, Offset,
+                    *Args);
+    }
+  }
+
+  Kern[V3::Kernel::Key::Args] = std::move(Args);
+}
+
+void MetadataStreamerV3::emitKernelArg(const Argument &Arg, unsigned &Offset,
+                                     msgpack::ArrayNode<> &Args) {
+  auto Func = Arg.getParent();
+  auto ArgNo = Arg.getArgNo();
+  const MDNode *Node;
+
+  StringRef Name;
+  Node = Func->getMetadata("kernel_arg_name");
+  if (Node && ArgNo < Node->getNumOperands())
+    Name = cast<MDString>(Node->getOperand(ArgNo))->getString();
+  else if (Arg.hasName())
+    Name = Arg.getName();
+
+  StringRef TypeName;
+  Node = Func->getMetadata("kernel_arg_type");
+  if (Node && ArgNo < Node->getNumOperands())
+    TypeName = cast<MDString>(Node->getOperand(ArgNo))->getString();
+
+  StringRef BaseTypeName;
+  Node = Func->getMetadata("kernel_arg_base_type");
+  if (Node && ArgNo < Node->getNumOperands())
+    BaseTypeName = cast<MDString>(Node->getOperand(ArgNo))->getString();
+
+  StringRef AccQual;
+  if (Arg.getType()->isPointerTy() && Arg.onlyReadsMemory() &&
+      Arg.hasNoAliasAttr()) {
+    AccQual = "read_only";
+  } else {
+    Node = Func->getMetadata("kernel_arg_access_qual");
+    if (Node && ArgNo < Node->getNumOperands())
+      AccQual = cast<MDString>(Node->getOperand(ArgNo))->getString();
+  }
+
+  StringRef TypeQual;
+  Node = Func->getMetadata("kernel_arg_type_qual");
+  if (Node && ArgNo < Node->getNumOperands())
+    TypeQual = cast<MDString>(Node->getOperand(ArgNo))->getString();
+
+  emitKernelArg(Func->getParent()->getDataLayout(), Arg.getType(),
+                getValueKind(Arg.getType(), TypeQual, BaseTypeName), Offset,
+                Args, Name, TypeName, BaseTypeName, AccQual, TypeQual);
+}
+
+void MetadataStreamerV3::emitKernelArg(const DataLayout &DL, Type *Ty,
+                                     StringRef ValueKind, unsigned &Offset,
+                                     msgpack::ArrayNode<> &Args, StringRef Name,
+                                     StringRef TypeName, StringRef BaseTypeName,
+                                     StringRef AccQual, StringRef TypeQual) {
+  auto ArgPtr = make_unique<msgpack::MapNode<>>();
+  auto &Arg = *ArgPtr;
+
+  if (!Name.empty())
+    Arg[V3::Kernel::Arg::Key::Name] = make_unique<msgpack::ScalarNode>(Name);
+  if (!TypeName.empty())
+    Arg[V3::Kernel::Arg::Key::TypeName] = make_unique<msgpack::ScalarNode>(TypeName);
+  auto Size = DL.getTypeAllocSize(Ty);
+  auto Align = DL.getABITypeAlignment(Ty);
+  Arg[V3::Kernel::Arg::Key::Size] = make_unique<msgpack::ScalarNode>(Size);
+  Offset = alignTo(Offset, Align);
+  Arg[V3::Kernel::Arg::Key::Offset] = make_unique<msgpack::ScalarNode>(Offset);
+  Offset += Size;
+  Arg[V3::Kernel::Arg::Key::ValueKind] =
+      make_unique<msgpack::ScalarNode>(ValueKind);
+  Arg[V3::Kernel::Arg::Key::ValueType] =
+      make_unique<msgpack::ScalarNode>(getValueType(Ty, BaseTypeName));
+
+  if (auto PtrTy = dyn_cast<PointerType>(Ty)) {
+    auto ElTy = PtrTy->getElementType();
+    if (PtrTy->getAddressSpace() == AMDGPUASI.LOCAL_ADDRESS && ElTy->isSized())
+      Arg[V3::Kernel::Arg::Key::PointeeAlign] =
+          make_unique<msgpack::ScalarNode>(DL.getABITypeAlignment(ElTy));
+  }
+
+  if (auto PtrTy = dyn_cast<PointerType>(Ty))
+    Arg[V3::Kernel::Arg::Key::AddressSpace] = make_unique<msgpack::ScalarNode>(
+        getAddressSpaceQualifer(PtrTy->getAddressSpace()));
+
+  if (auto AQ = getAccessQualifier(AccQual))
+    Arg[V3::Kernel::Arg::Key::Access] = make_unique<msgpack::ScalarNode>(*AQ);
+
+  // TODO: Emit Arg[V3::Kernel::Arg::Key::ActualAccess].
+
+  SmallVector<StringRef, 1> SplitTypeQuals;
+  TypeQual.split(SplitTypeQuals, " ", -1, false);
+  for (StringRef Key : SplitTypeQuals) {
+    if (Key == "const")
+      Arg[V3::Kernel::Arg::Key::IsConst] = make_unique<msgpack::ScalarNode>(true);
+    else if (Key == "restrict")
+      Arg[V3::Kernel::Arg::Key::IsRestrict] =
+          make_unique<msgpack::ScalarNode>(true);
+    else if (Key == "volatile")
+      Arg[V3::Kernel::Arg::Key::IsVolatile] =
+          make_unique<msgpack::ScalarNode>(true);
+    else if (Key == "pipe")
+      Arg[V3::Kernel::Arg::Key::IsPipe] = make_unique<msgpack::ScalarNode>(true);
+  }
+
+  Args.push_back(std::move(ArgPtr));
+}
+
+std::unique_ptr<msgpack::MapNode<>>
+MetadataStreamerV3::getHSAKernelProps(const MachineFunction &MF,
+                                      const SIProgramInfo &ProgramInfo) const {
+  const SISubtarget &STM = MF.getSubtarget<SISubtarget>();
+  const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
+
+  auto HSAKernelProps = make_unique<msgpack::MapNode<>>();
+  auto &Kern = *HSAKernelProps;
+
+  Kern[V3::Kernel::Key::KernargSegmentSize] = make_unique<msgpack::ScalarNode>(
+      STM.getKernArgSegmentSize(MF.getFunction(), MFI.getABIArgOffset()));
+  Kern[V3::Kernel::Key::GroupSegmentFixedSize] =
+      make_unique<msgpack::ScalarNode>(ProgramInfo.LDSSize);
+  Kern[V3::Kernel::Key::PrivateSegmentFixedSize] =
+      make_unique<msgpack::ScalarNode>(ProgramInfo.ScratchSize);
+  Kern[V3::Kernel::Key::KernargSegmentAlign] = make_unique<msgpack::ScalarNode>(
+      std::max(uint32_t(4), MFI.getMaxKernArgAlign()));
+  Kern[V3::Kernel::Key::WavefrontSize] =
+      make_unique<msgpack::ScalarNode>(STM.getWavefrontSize());
+  Kern[V3::Kernel::Key::SGPRCount] =
+      make_unique<msgpack::ScalarNode>(ProgramInfo.NumSGPR);
+  Kern[V3::Kernel::Key::VGPRCount] =
+      make_unique<msgpack::ScalarNode>(ProgramInfo.NumVGPR);
+  Kern[V3::Kernel::Key::MaxFlatWorkGroupSize] =
+      make_unique<msgpack::ScalarNode>(MFI.getMaxFlatWorkGroupSize());
+  Kern[V3::Kernel::Key::SGPRSpillCount] =
+      make_unique<msgpack::ScalarNode>(MFI.getNumSpilledSGPRs());
+  Kern[V3::Kernel::Key::VGPRSpillCount] =
+      make_unique<msgpack::ScalarNode>(MFI.getNumSpilledVGPRs());
+
+  return HSAKernelProps;
+}
+
+
+bool MetadataStreamerV3::emitTo(AMDGPUTargetStreamer &TargetStreamer) {
+  return TargetStreamer.EmitHSAMetadata(getHSAMetadataRoot(), true);
+}
+
+void MetadataStreamerV3::begin(const Module &Mod) {
+  AMDGPUASI = getAMDGPUAS(Mod);
+  emitVersion();
+  emitPrintf(Mod);
+  getRootMetadata(V3::Key::Kernels).reset(new msgpack::ArrayNode<>());
+}
+
+void MetadataStreamerV3::end() {
+  std::string HSAMetadataString;
+  raw_string_ostream StrOS(HSAMetadataString);
+  yaml::Output YOut(StrOS);
+  YOut << HSAMetadataRoot;
+
+  if (DumpHSAMetadata)
+    dump(StrOS.str());
+  if (VerifyHSAMetadata)
+    verify(StrOS.str());
+}
+
+void MetadataStreamerV3::emitKernel(const MachineFunction &MF, const SIProgramInfo &ProgramInfo) {
+  auto &Func = MF.getFunction();
+  auto KernelProps = getHSAKernelProps(MF, ProgramInfo);
+
+  if (Func.getCallingConv() != CallingConv::AMDGPU_KERNEL)
+    return;
+
+  auto &KernelsNode = getRootMetadata(V3::Key::Kernels);
+  auto Kernels = cast<msgpack::ArrayNode<>>(KernelsNode.get());
+
+  {
+    auto &Kern = *KernelProps;
+    Kern[V3::Kernel::Key::Name] = make_unique<msgpack::ScalarNode>(Func.getName());
+    Kern[V3::Kernel::Key::Symbol] = make_unique<msgpack::ScalarNode>(
+        (Twine(Func.getName()) + Twine(".kd")).str());
+    emitKernelLanguage(Func, Kern);
+    emitKernelAttrs(Func, Kern);
+    emitKernelArgs(Func, Kern);
+  }
+
+  Kernels->push_back(std::move(KernelProps));
+}
+
 } // end namespace HSAMD
 } // end namespace AMDGPU
 } // end namespace llvm
Index: lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.h
===================================================================
--- lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.h
+++ lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.h
@@ -11,6 +11,7 @@
 #define LLVM_LIB_TARGET_AMDGPU_MCTARGETDESC_AMDGPUTARGETSTREAMER_H
 
 #include "AMDKernelCodeT.h"
+#include "llvm/BinaryFormat/MsgPackTypes.h"
 #include "llvm/MC/MCStreamer.h"
 #include "llvm/MC/MCSubtargetInfo.h"
 #include "llvm/Support/AMDGPUMetadata.h"
@@ -56,7 +57,20 @@
   virtual bool EmitISAVersion(StringRef IsaVersionString) = 0;
 
   /// \returns True on success, false on failure.
-  virtual bool EmitHSAMetadata(StringRef HSAMetadataString);
+  virtual bool EmitHSAMetadataV2(StringRef HSAMetadataString);
+
+  /// \returns True on success, false on failure.
+  virtual bool EmitHSAMetadataV3(StringRef HSAMetadataString);
+
+  /// Emit HSA Metadata
+  ///
+  /// When \p Strict is true, known metadata elements must already be
+  /// well-typed. When \p Strict is false, known types are inferred and
+  /// the \p HSAMetadata structure is updated with the correct types.
+  ///
+  /// \returns True on success, false on failure.
+  virtual bool EmitHSAMetadata(std::unique_ptr<msgpack::Node> &HSAMetadata,
+                               bool Strict) = 0;
 
   /// \returns True on success, false on failure.
   virtual bool EmitHSAMetadata(const AMDGPU::HSAMD::Metadata &HSAMetadata) = 0;
@@ -88,6 +102,10 @@
   bool EmitISAVersion(StringRef IsaVersionString) override;
 
   /// \returns True on success, false on failure.
+  bool EmitHSAMetadata(std::unique_ptr<msgpack::Node> &HSAMetadata,
+                       bool Strict) override;
+
+  /// \returns True on success, false on failure.
   bool EmitHSAMetadata(const AMDGPU::HSAMD::Metadata &HSAMetadata) override;
 
   /// \returns True on success, false on failure.
@@ -101,8 +119,8 @@
 class AMDGPUTargetELFStreamer final : public AMDGPUTargetStreamer {
   MCStreamer &Streamer;
 
-  void EmitAMDGPUNote(const MCExpr *DescSize, unsigned NoteType,
-                      function_ref<void(MCELFStreamer &)> EmitDesc);
+  void EmitNote(StringRef Name, const MCExpr *DescSize, unsigned NoteType,
+                function_ref<void(MCELFStreamer &)> EmitDesc);
 
 public:
   AMDGPUTargetELFStreamer(MCStreamer &S, const MCSubtargetInfo &STI);
@@ -124,6 +142,10 @@
   bool EmitISAVersion(StringRef IsaVersionString) override;
 
   /// \returns True on success, false on failure.
+  bool EmitHSAMetadata(std::unique_ptr<msgpack::Node> &HSAMetadata,
+                       bool Strict) override;
+
+  /// \returns True on success, false on failure.
   bool EmitHSAMetadata(const AMDGPU::HSAMD::Metadata &HSAMetadata) override;
 
   /// \returns True on success, false on failure.
Index: lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp
===================================================================
--- lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp
+++ lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp
@@ -17,7 +17,9 @@
 #include "Utils/AMDGPUBaseInfo.h"
 #include "Utils/AMDKernelCodeTUtils.h"
 #include "llvm/ADT/Twine.h"
+#include "llvm/BinaryFormat/AMDGPUMetadataVerifier.h"
 #include "llvm/BinaryFormat/ELF.h"
+#include "llvm/BinaryFormat/MsgPackTypes.h"
 #include "llvm/IR/Constants.h"
 #include "llvm/IR/Function.h"
 #include "llvm/IR/Metadata.h"
@@ -34,6 +36,7 @@
 
 using namespace llvm;
 using namespace llvm::AMDGPU;
+using namespace llvm::AMDGPU::HSAMD;
 
 //===----------------------------------------------------------------------===//
 // AMDGPUTargetStreamer
@@ -117,7 +120,7 @@
   return Entry->Name;
 }
 
-bool AMDGPUTargetStreamer::EmitHSAMetadata(StringRef HSAMetadataString) {
+bool AMDGPUTargetStreamer::EmitHSAMetadataV2(StringRef HSAMetadataString) {
   HSAMD::Metadata HSAMetadata;
   if (HSAMD::fromString(HSAMetadataString, HSAMetadata))
     return false;
@@ -125,6 +128,15 @@
   return EmitHSAMetadata(HSAMetadata);
 }
 
+bool AMDGPUTargetStreamer::EmitHSAMetadataV3(StringRef HSAMetadataString) {
+  std::unique_ptr<msgpack::Node> HSAMetadataRoot;
+  yaml::Input YIn(HSAMetadataString);
+  YIn >> HSAMetadataRoot;
+  if (YIn.error())
+    return false;
+  return EmitHSAMetadata(HSAMetadataRoot, false);
+}
+
 //===----------------------------------------------------------------------===//
 // AMDGPUTargetAsmStreamer
 //===----------------------------------------------------------------------===//
@@ -180,9 +192,26 @@
   if (HSAMD::toString(HSAMetadata, HSAMetadataString))
     return false;
 
-  OS << '\t' << HSAMD::AssemblerDirectiveBegin << '\n';
+  OS << '\t' << AssemblerDirectiveBegin << '\n';
   OS << HSAMetadataString << '\n';
-  OS << '\t' << HSAMD::AssemblerDirectiveEnd << '\n';
+  OS << '\t' << AssemblerDirectiveEnd << '\n';
+  return true;
+}
+
+bool AMDGPUTargetAsmStreamer::EmitHSAMetadata(
+    std::unique_ptr<msgpack::Node> &HSAMetadataRoot, bool Strict) {
+  V3::MetadataVerifier Verifier(Strict);
+  if (!Verifier.verify(HSAMetadataRoot))
+    return false;
+
+  std::string HSAMetadataString;
+  raw_string_ostream StrOS(HSAMetadataString);
+  yaml::Output YOut(StrOS);
+  YOut << HSAMetadataRoot;
+
+  OS << '\t' << AssemblerDirectiveBegin << '\n';
+  OS << StrOS.str() << '\n';
+  OS << '\t' << AssemblerDirectiveEnd << '\n';
   return true;
 }
 
@@ -226,13 +255,13 @@
   return static_cast<MCELFStreamer &>(Streamer);
 }
 
-void AMDGPUTargetELFStreamer::EmitAMDGPUNote(
-    const MCExpr *DescSZ, unsigned NoteType,
+void AMDGPUTargetELFStreamer::EmitNote(
+    StringRef Name, const MCExpr *DescSZ, unsigned NoteType,
     function_ref<void(MCELFStreamer &)> EmitDesc) {
   auto &S = getStreamer();
   auto &Context = S.getContext();
 
-  auto NameSZ = sizeof(ElfNote::NoteName);
+  auto NameSZ = Name.size() + 1;
 
   S.PushSection();
   S.SwitchSection(Context.getELFSection(
@@ -240,7 +269,7 @@
   S.EmitIntValue(NameSZ, 4);                                  // namesz
   S.EmitValue(DescSZ, 4);                                     // descz
   S.EmitIntValue(NoteType, 4);                                // type
-  S.EmitBytes(StringRef(ElfNote::NoteName, NameSZ));          // name
+  S.EmitBytes(Name);                                          // name
   S.EmitValueToAlignment(4, 0, 1, 0);                         // padding 0
   EmitDesc(S);                                                // desc
   S.EmitValueToAlignment(4, 0, 1, 0);                         // padding 0
@@ -251,7 +280,7 @@
 AMDGPUTargetELFStreamer::EmitDirectiveHSACodeObjectVersion(uint32_t Major,
                                                            uint32_t Minor) {
 
-  EmitAMDGPUNote(
+  EmitNote(ElfNote::NoteNameV2,
     MCConstantExpr::create(8, getContext()),
     ElfNote::NT_AMDGPU_HSA_CODE_OBJECT_VERSION,
     [&](MCELFStreamer &OS){
@@ -274,7 +303,7 @@
     sizeof(Major) + sizeof(Minor) + sizeof(Stepping) +
     VendorNameSize + ArchNameSize;
 
-  EmitAMDGPUNote(
+  EmitNote(ElfNote::NoteNameV2,
     MCConstantExpr::create(DescSZ, getContext()),
     ElfNote::NT_AMDGPU_HSA_ISA,
     [&](MCELFStreamer &OS) {
@@ -317,7 +346,7 @@
     MCSymbolRefExpr::create(DescEnd, Context),
     MCSymbolRefExpr::create(DescBegin, Context), Context);
 
-  EmitAMDGPUNote(
+  EmitNote(ElfNote::NoteNameV2,
     DescSZ,
     ELF::NT_AMD_AMDGPU_ISA,
     [&](MCELFStreamer &OS) {
@@ -330,6 +359,38 @@
 }
 
 bool AMDGPUTargetELFStreamer::EmitHSAMetadata(
+    std::unique_ptr<msgpack::Node> &HSAMetadataRoot, bool Strict) {
+  V3::MetadataVerifier Verifier(Strict);
+  if (!Verifier.verify(HSAMetadataRoot))
+    return false;
+
+  std::string HSAMetadataString;
+  raw_string_ostream StrOS(HSAMetadataString);
+  msgpack::Writer MPWriter(StrOS);
+  HSAMetadataRoot->write(MPWriter);
+
+  // Create two labels to mark the beginning and end of the desc field
+  // and a MCExpr to calculate the size of the desc field.
+  auto &Context = getContext();
+  auto *DescBegin = Context.createTempSymbol();
+  auto *DescEnd = Context.createTempSymbol();
+  auto *DescSZ = MCBinaryExpr::createSub(
+    MCSymbolRefExpr::create(DescEnd, Context),
+    MCSymbolRefExpr::create(DescBegin, Context), Context);
+
+  EmitNote(ElfNote::NoteNameV3,
+    DescSZ,
+    ELF::NT_AMDGPU_METADATA,
+    [&](MCELFStreamer &OS) {
+      OS.EmitLabel(DescBegin);
+      OS.EmitBytes(StrOS.str());
+      OS.EmitLabel(DescEnd);
+    }
+  );
+  return true;
+}
+
+bool AMDGPUTargetELFStreamer::EmitHSAMetadata(
     const AMDGPU::HSAMD::Metadata &HSAMetadata) {
   std::string HSAMetadataString;
   if (HSAMD::toString(HSAMetadata, HSAMetadataString))
@@ -344,7 +405,7 @@
     MCSymbolRefExpr::create(DescEnd, Context),
     MCSymbolRefExpr::create(DescBegin, Context), Context);
 
-  EmitAMDGPUNote(
+  EmitNote(ElfNote::NoteNameV2,
     DescSZ,
     ELF::NT_AMD_AMDGPU_HSA_METADATA,
     [&](MCELFStreamer &OS) {
@@ -358,7 +419,7 @@
 
 bool AMDGPUTargetELFStreamer::EmitPALMetadata(
     const PALMD::Metadata &PALMetadata) {
-  EmitAMDGPUNote(
+  EmitNote(ElfNote::NoteNameV2,
     MCConstantExpr::create(PALMetadata.size() * sizeof(uint32_t), getContext()),
     ELF::NT_AMD_AMDGPU_PAL_METADATA,
     [&](MCELFStreamer &OS){
Index: test/CodeGen/AMDGPU/attr-amdgpu-flat-work-group-size-v3.ll
===================================================================
--- /dev/null
+++ test/CodeGen/AMDGPU/attr-amdgpu-flat-work-group-size-v3.ll
@@ -0,0 +1,145 @@
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx803 -mattr=+code-object-v3 < %s | FileCheck --check-prefix=CHECK %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx803 -amdgpu-verify-hsa-metadata -filetype=obj -mattr=+code-object-v3 -o /dev/null < %s 2>&1 | FileCheck --check-prefix=PARSER %s
+
+; CHECK-LABEL: {{^}}min_64_max_64:
+; CHECK: SGPRBlocks: 0
+; CHECK: VGPRBlocks: 0
+; CHECK: NumSGPRsForWavesPerEU: 1
+; CHECK: NumVGPRsForWavesPerEU: 1
+define amdgpu_kernel void @min_64_max_64() #0 {
+entry:
+  ret void
+}
+attributes #0 = {"amdgpu-flat-work-group-size"="64,64"}
+
+; CHECK-LABEL: {{^}}min_64_max_128:
+; CHECK: SGPRBlocks: 0
+; CHECK: VGPRBlocks: 0
+; CHECK: NumSGPRsForWavesPerEU: 1
+; CHECK: NumVGPRsForWavesPerEU: 1
+define amdgpu_kernel void @min_64_max_128() #1 {
+entry:
+  ret void
+}
+attributes #1 = {"amdgpu-flat-work-group-size"="64,128"}
+
+; CHECK-LABEL: {{^}}min_128_max_128:
+; CHECK: SGPRBlocks: 0
+; CHECK: VGPRBlocks: 0
+; CHECK: NumSGPRsForWavesPerEU: 1
+; CHECK: NumVGPRsForWavesPerEU: 1
+define amdgpu_kernel void @min_128_max_128() #2 {
+entry:
+  ret void
+}
+attributes #2 = {"amdgpu-flat-work-group-size"="128,128"}
+
+; CHECK-LABEL: {{^}}min_1024_max_2048
+; CHECK: SGPRBlocks: 1
+; CHECK: VGPRBlocks: 7
+; CHECK: NumSGPRsForWavesPerEU: 12
+; CHECK: NumVGPRsForWavesPerEU: 32
+@var = addrspace(1) global float 0.0
+define amdgpu_kernel void @min_1024_max_2048() #3 {
+  %val0 = load volatile float, float addrspace(1)* @var
+  %val1 = load volatile float, float addrspace(1)* @var
+  %val2 = load volatile float, float addrspace(1)* @var
+  %val3 = load volatile float, float addrspace(1)* @var
+  %val4 = load volatile float, float addrspace(1)* @var
+  %val5 = load volatile float, float addrspace(1)* @var
+  %val6 = load volatile float, float addrspace(1)* @var
+  %val7 = load volatile float, float addrspace(1)* @var
+  %val8 = load volatile float, float addrspace(1)* @var
+  %val9 = load volatile float, float addrspace(1)* @var
+  %val10 = load volatile float, float addrspace(1)* @var
+  %val11 = load volatile float, float addrspace(1)* @var
+  %val12 = load volatile float, float addrspace(1)* @var
+  %val13 = load volatile float, float addrspace(1)* @var
+  %val14 = load volatile float, float addrspace(1)* @var
+  %val15 = load volatile float, float addrspace(1)* @var
+  %val16 = load volatile float, float addrspace(1)* @var
+  %val17 = load volatile float, float addrspace(1)* @var
+  %val18 = load volatile float, float addrspace(1)* @var
+  %val19 = load volatile float, float addrspace(1)* @var
+  %val20 = load volatile float, float addrspace(1)* @var
+  %val21 = load volatile float, float addrspace(1)* @var
+  %val22 = load volatile float, float addrspace(1)* @var
+  %val23 = load volatile float, float addrspace(1)* @var
+  %val24 = load volatile float, float addrspace(1)* @var
+  %val25 = load volatile float, float addrspace(1)* @var
+  %val26 = load volatile float, float addrspace(1)* @var
+  %val27 = load volatile float, float addrspace(1)* @var
+  %val28 = load volatile float, float addrspace(1)* @var
+  %val29 = load volatile float, float addrspace(1)* @var
+  %val30 = load volatile float, float addrspace(1)* @var
+  %val31 = load volatile float, float addrspace(1)* @var
+  %val32 = load volatile float, float addrspace(1)* @var
+  %val33 = load volatile float, float addrspace(1)* @var
+  %val34 = load volatile float, float addrspace(1)* @var
+  %val35 = load volatile float, float addrspace(1)* @var
+  %val36 = load volatile float, float addrspace(1)* @var
+  %val37 = load volatile float, float addrspace(1)* @var
+  %val38 = load volatile float, float addrspace(1)* @var
+  %val39 = load volatile float, float addrspace(1)* @var
+  %val40 = load volatile float, float addrspace(1)* @var
+
+  store volatile float %val0, float addrspace(1)* @var
+  store volatile float %val1, float addrspace(1)* @var
+  store volatile float %val2, float addrspace(1)* @var
+  store volatile float %val3, float addrspace(1)* @var
+  store volatile float %val4, float addrspace(1)* @var
+  store volatile float %val5, float addrspace(1)* @var
+  store volatile float %val6, float addrspace(1)* @var
+  store volatile float %val7, float addrspace(1)* @var
+  store volatile float %val8, float addrspace(1)* @var
+  store volatile float %val9, float addrspace(1)* @var
+  store volatile float %val10, float addrspace(1)* @var
+  store volatile float %val11, float addrspace(1)* @var
+  store volatile float %val12, float addrspace(1)* @var
+  store volatile float %val13, float addrspace(1)* @var
+  store volatile float %val14, float addrspace(1)* @var
+  store volatile float %val15, float addrspace(1)* @var
+  store volatile float %val16, float addrspace(1)* @var
+  store volatile float %val17, float addrspace(1)* @var
+  store volatile float %val18, float addrspace(1)* @var
+  store volatile float %val19, float addrspace(1)* @var
+  store volatile float %val20, float addrspace(1)* @var
+  store volatile float %val21, float addrspace(1)* @var
+  store volatile float %val22, float addrspace(1)* @var
+  store volatile float %val23, float addrspace(1)* @var
+  store volatile float %val24, float addrspace(1)* @var
+  store volatile float %val25, float addrspace(1)* @var
+  store volatile float %val26, float addrspace(1)* @var
+  store volatile float %val27, float addrspace(1)* @var
+  store volatile float %val28, float addrspace(1)* @var
+  store volatile float %val29, float addrspace(1)* @var
+  store volatile float %val30, float addrspace(1)* @var
+  store volatile float %val31, float addrspace(1)* @var
+  store volatile float %val32, float addrspace(1)* @var
+  store volatile float %val33, float addrspace(1)* @var
+  store volatile float %val34, float addrspace(1)* @var
+  store volatile float %val35, float addrspace(1)* @var
+  store volatile float %val36, float addrspace(1)* @var
+  store volatile float %val37, float addrspace(1)* @var
+  store volatile float %val38, float addrspace(1)* @var
+  store volatile float %val39, float addrspace(1)* @var
+  store volatile float %val40, float addrspace(1)* @var
+
+  ret void
+}
+attributes #3 = {"amdgpu-flat-work-group-size"="1024,2048"}
+
+; CHECK: amdhsa.kernels:
+; CHECK:   .max_flat_workgroup_size: 64
+; CHECK:   .name:                 min_64_max_64
+; CHECK:   .max_flat_workgroup_size: 128
+; CHECK:   .name:                 min_64_max_128
+; CHECK:   .max_flat_workgroup_size: 128
+; CHECK:   .name:                 min_128_max_128
+; CHECK:   .max_flat_workgroup_size: 2048
+; CHECK:   .name:                 min_1024_max_2048
+; CHECK: amdhsa.version:
+; CHECK:   - 1
+; CHECK:   - 0
+
+; PARSER: AMDGPU HSA Metadata Parser Test: PASS
Index: test/CodeGen/AMDGPU/code-object-v3.ll
===================================================================
--- test/CodeGen/AMDGPU/code-object-v3.ll
+++ test/CodeGen/AMDGPU/code-object-v3.ll
@@ -14,7 +14,6 @@
 ; OSABI-AMDHSA-ASM-NOT: .hsa_code_object_version
 ; OSABI-AMDHSA-ASM-NOT: .hsa_code_object_isa
 ; OSABI-AMDHSA-ASM-NOT: .amd_amdgpu_isa
-; OSABI-AMDHSA-ASM-NOT: .amd_amdgpu_hsa_metadata
 ; OSABI-AMDHSA-ASM-NOT: .amd_amdgpu_pal_metadata
 
 ; OSABI-AMDHSA-ELF: Section Headers
@@ -31,8 +30,6 @@
 ; OSABI-AMDHSA-ELF: {{[0-9]+}}: 0000000000000000 64         OBJECT GLOBAL DEFAULT {{[0-9]+}} fadd.kd
 ; OSABI-AMDHSA-ELF: {{[0-9]+}}: 0000000000000040 64         OBJECT GLOBAL DEFAULT {{[0-9]+}} fsub.kd
 
-; OSABI-AMDHSA-ELF-NOT: Displaying notes found
-
 define amdgpu_kernel void @fadd(
     float addrspace(1)* %r,
     float addrspace(1)* %a,
Index: test/CodeGen/AMDGPU/hsa-metadata-deduce-ro-arg-v3.ll
===================================================================
--- /dev/null
+++ test/CodeGen/AMDGPU/hsa-metadata-deduce-ro-arg-v3.ll
@@ -0,0 +1,33 @@
+; RUN: llc -mattr=+code-object-v3 -mtriple=amdgcn-amd-amdhsa -filetype=obj -o - < %s | llvm-readobj -elf-output-style=GNU -notes | FileCheck %s
+
+; CHECK:        .symbol:          test_ro_arg.kd
+; CHECK:        .name:            test_ro_arg
+; CHECK:        .args:
+; CHECK-NEXT:     - .type_name:       'float*'
+; CHECK-NEXT:       .value_kind:      global_buffer
+; CHECK-NEXT:       .name:            in
+; CHECK-NEXT:       .access:          read_only
+; CHECK-NEXT:       .offset:          0
+; CHECK-NEXT:       .is_const:        true
+; CHECK-NEXT:       .size:            8
+; CHECK-NEXT:       .is_restrict:     true
+; CHECK-NEXT:       .value_type:      f32
+; CHECK-NEXT:       .address_space:   global
+; CHECK-NEXT:     - .type_name:       'float*'
+; CHECK-NEXT:       .value_kind:      global_buffer
+; CHECK-NEXT:       .name:            out
+; CHECK-NEXT:       .offset:          8
+; CHECK-NEXT:       .size:            8
+; CHECK-NEXT:       .value_type:      f32
+; CHECK-NEXT:       .address_space:   global
+
+define amdgpu_kernel void @test_ro_arg(float addrspace(1)* noalias readonly %in, float addrspace(1)* %out)
+    !kernel_arg_addr_space !0 !kernel_arg_access_qual !1 !kernel_arg_type !2
+    !kernel_arg_base_type !2 !kernel_arg_type_qual !3 {
+  ret void
+}
+
+!0 = !{i32 1, i32 1}
+!1 = !{!"none", !"none"}
+!2 = !{!"float*", !"float*"}
+!3 = !{!"const restrict", !""}
Index: test/CodeGen/AMDGPU/hsa-metadata-enqueu-kernel-v3.ll
===================================================================
--- /dev/null
+++ test/CodeGen/AMDGPU/hsa-metadata-enqueu-kernel-v3.ll
@@ -0,0 +1,101 @@
+; RUN: llc -mattr=+code-object-v3 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -filetype=obj -o - < %s | llvm-readobj -elf-output-style=GNU -notes | FileCheck --check-prefix=CHECK --check-prefix=GFX900 --check-prefix=NOTES %s
+; RUN: llc -mattr=+code-object-v3 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -amdgpu-dump-hsa-metadata -amdgpu-verify-hsa-metadata -filetype=obj -o - < %s 2>&1 | FileCheck --check-prefix=PARSER %s
+
+; CHECK: ---
+; CHECK:  amdhsa.kernels:
+; CHECK:        .symbol:          test_non_enqueue_kernel_caller.kd
+; CHECK:        .name:            test_non_enqueue_kernel_caller
+; CHECK:        .language:        OpenCL C
+; CHECK:        .language_version:
+; CHECK-NEXT:     - 2
+; CHECK-NEXT:     - 0
+; CHECK:        .args:
+; CHECK-NEXT:     - .type_name:     char
+; CHECK-NEXT:       .value_kind:    by_value
+; CHECK-NEXT:       .offset:        0
+; CHECK-NEXT:       .size:          1
+; CHECK-NEXT:       .value_type:    i8
+; CHECK-NEXT:       .name:          a
+; CHECK-NEXT:     - .value_kind:    hidden_global_offset_x
+; CHECK-NEXT:       .offset:        8
+; CHECK-NEXT:       .size:          8
+; CHECK-NEXT:       .value_type:    i64
+; CHECK-NEXT:     - .value_kind:    hidden_global_offset_y
+; CHECK-NEXT:       .offset:        16
+; CHECK-NEXT:       .size:          8
+; CHECK-NEXT:       .value_type:    i64
+; CHECK-NEXT:     - .value_kind:    hidden_global_offset_z
+; CHECK-NEXT:       .offset:        24
+; CHECK-NEXT:       .size:          8
+; CHECK-NEXT:       .value_type:    i64
+; CHECK-NOT:        .value_kind:    hidden_default_queue
+; CHECK-NOT:        .value_kind:    hidden_completion_action
+define amdgpu_kernel void @test_non_enqueue_kernel_caller(i8 %a)
+    !kernel_arg_addr_space !1 !kernel_arg_access_qual !2 !kernel_arg_type !3
+    !kernel_arg_base_type !3 !kernel_arg_type_qual !4 {
+  ret void
+}
+
+; CHECK:        .symbol:          test_enqueue_kernel_caller.kd
+; CHECK:        .name:            test_enqueue_kernel_caller
+; CHECK:        .language:        OpenCL C
+; CHECK:        .language_version:
+; CHECK-NEXT:     - 2
+; CHECK-NEXT:     - 0
+; CHECK:        .args:
+; CHECK-NEXT:     - .type_name:     char
+; CHECK-NEXT:       .value_kind:    by_value
+; CHECK-NEXT:       .offset:        0
+; CHECK-NEXT:       .size:          1
+; CHECK-NEXT:       .value_type:    i8
+; CHECK-NEXT:       .name:          a
+; CHECK-NEXT:     - .value_kind:    hidden_global_offset_x
+; CHECK-NEXT:       .offset:        8
+; CHECK-NEXT:       .size:          8
+; CHECK-NEXT:       .value_type:    i64
+; CHECK-NEXT:     - .value_kind:    hidden_global_offset_y
+; CHECK-NEXT:       .offset:        16
+; CHECK-NEXT:       .size:          8
+; CHECK-NEXT:       .value_type:    i64
+; CHECK-NEXT:     - .value_kind:    hidden_global_offset_z
+; CHECK-NEXT:       .offset:        24
+; CHECK-NEXT:       .size:          8
+; CHECK-NEXT:       .value_type:    i64
+; CHECK-NEXT:     - .value_kind:    hidden_none
+; CHECK-NEXT:       .offset:        32
+; CHECK-NEXT:       .size:          8
+; CHECK-NEXT:       .value_type:    i8
+; CHECK-NEXT:       .address_space: global
+; CHECK-NEXT:     - .value_kind:    hidden_default_queue
+; CHECK-NEXT:       .offset:        40
+; CHECK-NEXT:       .size:          8
+; CHECK-NEXT:       .value_type:    i8
+; CHECK-NEXT:       .address_space: global
+; CHECK-NEXT:     - .value_kind:    hidden_completion_action
+; CHECK-NEXT:       .offset:        48
+; CHECK-NEXT:       .size:          8
+; CHECK-NEXT:       .value_type:    i8
+; CHECK-NEXT:       .address_space: global
+define amdgpu_kernel void @test_enqueue_kernel_caller(i8 %a) #0
+    !kernel_arg_addr_space !1 !kernel_arg_access_qual !2 !kernel_arg_type !3
+    !kernel_arg_base_type !3 !kernel_arg_type_qual !4 {
+  ret void
+}
+
+; CHECK:  amdhsa.version:
+; CHECK-NEXT: - 1
+; CHECK-NEXT: - 0
+; CHECK-NOT:  amdhsa.printf:
+
+attributes #0 = { "calls-enqueue-kernel" }
+
+!1 = !{i32 0}
+!2 = !{!"none"}
+!3 = !{!"char"}
+!4 = !{!""}
+
+!opencl.ocl.version = !{!90}
+!90 = !{i32 2, i32 0}
+
+
+; PARSER: AMDGPU HSA Metadata Parser Test: PASS
Index: test/CodeGen/AMDGPU/hsa-metadata-from-llvm-ir-full-v3.ll
===================================================================
--- /dev/null
+++ test/CodeGen/AMDGPU/hsa-metadata-from-llvm-ir-full-v3.ll
@@ -0,0 +1,1441 @@
+; RUN: llc -mattr=+code-object-v3 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx700 -filetype=obj -o - < %s | llvm-readobj -elf-output-style=GNU -notes | FileCheck --check-prefix=CHECK --check-prefix=GFX700 --check-prefix=NOTES %s
+; RUN: llc -mattr=+code-object-v3 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx802 -filetype=obj -o - < %s | llvm-readobj -elf-output-style=GNU -notes | FileCheck --check-prefix=CHECK --check-prefix=GFX802 --check-prefix=NOTES %s
+; RUN: llc -mattr=+code-object-v3 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -filetype=obj -o - < %s | llvm-readobj -elf-output-style=GNU -notes | FileCheck --check-prefix=CHECK --check-prefix=GFX900 --check-prefix=NOTES %s
+; RUN: llc -mattr=+code-object-v3 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx700 -amdgpu-dump-hsa-metadata -amdgpu-verify-hsa-metadata -filetype=obj -o - < %s 2>&1 | FileCheck --check-prefix=PARSER %s
+; RUN: llc -mattr=+code-object-v3 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx802 -amdgpu-dump-hsa-metadata -amdgpu-verify-hsa-metadata -filetype=obj -o - < %s 2>&1 | FileCheck --check-prefix=PARSER %s
+; RUN: llc -mattr=+code-object-v3 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -amdgpu-dump-hsa-metadata -amdgpu-verify-hsa-metadata -filetype=obj -o - < %s 2>&1 | FileCheck --check-prefix=PARSER %s
+
+%struct.A = type { i8, float }
+%opencl.image1d_t = type opaque
+%opencl.image2d_t = type opaque
+%opencl.image3d_t = type opaque
+%opencl.queue_t = type opaque
+%opencl.pipe_t = type opaque
+%struct.B = type { i32 addrspace(1)*}
+%opencl.clk_event_t = type opaque
+
+@__test_block_invoke_kernel_runtime_handle = external addrspace(1) externally_initialized constant i8 addrspace(1)*
+
+; CHECK: ---
+; CHECK:  amdhsa.kernels:
+; CHECK:        .symbol:          test_char.kd
+; CHECK:        .name:            test_char
+; CHECK:        .language:        OpenCL C
+; CHECK:        .language_version:
+; CHECK-NEXT:     - 2
+; CHECK-NEXT:     - 0
+; CHECK:        .args:
+; CHECK-NEXT:     - .type_name:      char
+; CHECK-NEXT:       .value_kind:     by_value
+; CHECK-NEXT:       .offset:         0
+; CHECK-NEXT:       .size:           1
+; CHECK-NEXT:       .value_type:     i8
+; CHECK-NEXT:       .name:           a
+; CHECK-NEXT:     - .value_kind:     hidden_global_offset_x
+; CHECK-NEXT:       .offset:         8
+; CHECK-NEXT:       .size:           8
+; CHECK-NEXT:       .value_type:     i64
+; CHECK-NEXT:     - .value_kind:     hidden_global_offset_y
+; CHECK-NEXT:       .offset:         16
+; CHECK-NEXT:       .size:           8
+; CHECK-NEXT:       .value_type:     i64
+; CHECK-NEXT:     - .value_kind:     hidden_global_offset_z
+; CHECK-NEXT:       .offset:         24
+; CHECK-NEXT:       .size:           8
+; CHECK-NEXT:       .value_type:     i64
+; CHECK-NEXT:     - .value_kind:     hidden_printf_buffer
+; CHECK-NEXT:       .offset:         32
+; CHECK-NEXT:       .size:           8
+; CHECK-NEXT:       .value_type:     i8
+; CHECK-NEXT:       .address_space:  global
+; CHECK-NOT:        .value_kind:     hidden_default_queue
+; CHECK-NOT:        .value_kind:     hidden_completion_action
+define amdgpu_kernel void @test_char(i8 %a)
+    !kernel_arg_addr_space !1 !kernel_arg_access_qual !2 !kernel_arg_type !9
+    !kernel_arg_base_type !9 !kernel_arg_type_qual !4 {
+  ret void
+}
+
+; CHECK:        .symbol:          test_ushort2.kd
+; CHECK:        .name:            test_ushort2
+; CHECK:        .language:        OpenCL C
+; CHECK:        .language_version:
+; CHECK-NEXT:     - 2
+; CHECK-NEXT:     - 0
+; CHECK:        .args:
+; CHECK-NEXT:     - .type_name:      ushort2
+; CHECK-NEXT:       .value_kind:     by_value
+; CHECK-NEXT:       .offset:         0
+; CHECK-NEXT:       .size:           4
+; CHECK-NEXT:       .value_type:     u16
+; CHECK-NEXT:       .name:           a
+; CHECK-NEXT:     - .value_kind:     hidden_global_offset_x
+; CHECK-NEXT:       .offset:         8
+; CHECK-NEXT:       .size:           8
+; CHECK-NEXT:       .value_type:     i64
+; CHECK-NEXT:     - .value_kind:     hidden_global_offset_y
+; CHECK-NEXT:       .offset:         16
+; CHECK-NEXT:       .size:           8
+; CHECK-NEXT:       .value_type:     i64
+; CHECK-NEXT:     - .value_kind:     hidden_global_offset_z
+; CHECK-NEXT:       .offset:         24
+; CHECK-NEXT:       .size:           8
+; CHECK-NEXT:       .value_type:     i64
+; CHECK-NEXT:     - .value_kind:     hidden_printf_buffer
+; CHECK-NEXT:       .offset:         32
+; CHECK-NEXT:       .size:           8
+; CHECK-NEXT:       .value_type:     i8
+; CHECK-NEXT:       .address_space:  global
+define amdgpu_kernel void @test_ushort2(<2 x i16> %a)
+    !kernel_arg_addr_space !1 !kernel_arg_access_qual !2 !kernel_arg_type !10
+    !kernel_arg_base_type !10 !kernel_arg_type_qual !4 {
+  ret void
+}
+
+; CHECK:        .symbol:          test_int3.kd
+; CHECK:        .name:            test_int3
+; CHECK:        .language:        OpenCL C
+; CHECK:        .language_version:
+; CHECK-NEXT:     - 2
+; CHECK-NEXT:     - 0
+; CHECK:        .args:
+; CHECK-NEXT:     - .type_name:      int3
+; CHECK-NEXT:       .value_kind:     by_value
+; CHECK-NEXT:       .offset:         0
+; CHECK-NEXT:       .size:           16
+; CHECK-NEXT:       .value_type:     i32
+; CHECK-NEXT:       .name:           a
+; CHECK-NEXT:     - .value_kind:     hidden_global_offset_x
+; CHECK-NEXT:       .offset:         16
+; CHECK-NEXT:       .size:           8
+; CHECK-NEXT:       .value_type:     i64
+; CHECK-NEXT:     - .value_kind:     hidden_global_offset_y
+; CHECK-NEXT:       .offset:         24
+; CHECK-NEXT:       .size:           8
+; CHECK-NEXT:       .value_type:     i64
+; CHECK-NEXT:     - .value_kind:     hidden_global_offset_z
+; CHECK-NEXT:       .offset:         32
+; CHECK-NEXT:       .size:           8
+; CHECK-NEXT:       .value_type:     i64
+; CHECK-NEXT:     - .value_kind:     hidden_printf_buffer
+; CHECK-NEXT:       .offset:         40
+; CHECK-NEXT:       .size:           8
+; CHECK-NEXT:       .value_type:     i8
+; CHECK-NEXT:       .address_space:  global
+define amdgpu_kernel void @test_int3(<3 x i32> %a)
+    !kernel_arg_addr_space !1 !kernel_arg_access_qual !2 !kernel_arg_type !11
+    !kernel_arg_base_type !11 !kernel_arg_type_qual !4 {
+  ret void
+}
+
+; CHECK:        .symbol:          test_ulong4.kd
+; CHECK:        .name:            test_ulong4
+; CHECK:        .language:        OpenCL C
+; CHECK:        .language_version:
+; CHECK-NEXT:     - 2
+; CHECK-NEXT:     - 0
+; CHECK:        .args:
+; CHECK-NEXT:     - .type_name:      ulong4
+; CHECK-NEXT:       .value_kind:     by_value
+; CHECK-NEXT:       .offset:         0
+; CHECK-NEXT:       .size:           32
+; CHECK-NEXT:       .value_type:     u64
+; CHECK-NEXT:       .name:           a
+; CHECK-NEXT:     - .value_kind:     hidden_global_offset_x
+; CHECK-NEXT:       .offset:         32
+; CHECK-NEXT:       .size:           8
+; CHECK-NEXT:       .value_type:     i64
+; CHECK-NEXT:     - .value_kind:     hidden_global_offset_y
+; CHECK-NEXT:       .offset:         40
+; CHECK-NEXT:       .size:           8
+; CHECK-NEXT:       .value_type:     i64
+; CHECK-NEXT:     - .value_kind:     hidden_global_offset_z
+; CHECK-NEXT:       .offset:         48
+; CHECK-NEXT:       .size:           8
+; CHECK-NEXT:       .value_type:     i64
+; CHECK-NEXT:     - .value_kind:     hidden_printf_buffer
+; CHECK-NEXT:       .offset:         56
+; CHECK-NEXT:       .size:           8
+; CHECK-NEXT:       .value_type:     i8
+; CHECK-NEXT:       .address_space:  global
+define amdgpu_kernel void @test_ulong4(<4 x i64> %a)
+    !kernel_arg_addr_space !1 !kernel_arg_access_qual !2 !kernel_arg_type !12
+    !kernel_arg_base_type !12 !kernel_arg_type_qual !4 {
+  ret void
+}
+
+; CHECK:        .symbol:          test_half8.kd
+; CHECK:        .name:            test_half8
+; CHECK:        .language:        OpenCL C
+; CHECK:        .language_version:
+; CHECK-NEXT:     - 2
+; CHECK-NEXT:     - 0
+; CHECK:        .args:
+; CHECK-NEXT:     - .type_name:      half8
+; CHECK-NEXT:       .value_kind:     by_value
+; CHECK-NEXT:       .offset:         0
+; CHECK-NEXT:       .size:           16
+; CHECK-NEXT:       .value_type:     f16
+; CHECK-NEXT:       .name:           a
+; CHECK-NEXT:     - .value_kind:     hidden_global_offset_x
+; CHECK-NEXT:       .offset:         16
+; CHECK-NEXT:       .size:           8
+; CHECK-NEXT:       .value_type:     i64
+; CHECK-NEXT:     - .value_kind:     hidden_global_offset_y
+; CHECK-NEXT:       .offset:         24
+; CHECK-NEXT:       .size:           8
+; CHECK-NEXT:       .value_type:     i64
+; CHECK-NEXT:     - .value_kind:     hidden_global_offset_z
+; CHECK-NEXT:       .offset:         32
+; CHECK-NEXT:       .size:           8
+; CHECK-NEXT:       .value_type:     i64
+; CHECK-NEXT:     - .value_kind:     hidden_printf_buffer
+; CHECK-NEXT:       .offset:         40
+; CHECK-NEXT:       .size:           8
+; CHECK-NEXT:       .value_type:     i8
+; CHECK-NEXT:       .address_space:  global
+define amdgpu_kernel void @test_half8(<8 x half> %a)
+    !kernel_arg_addr_space !1 !kernel_arg_access_qual !2 !kernel_arg_type !13
+    !kernel_arg_base_type !13 !kernel_arg_type_qual !4 {
+  ret void
+}
+
+; CHECK:        .symbol:          test_float16.kd
+; CHECK:        .name:            test_float16
+; CHECK:        .language:        OpenCL C
+; CHECK:        .language_version:
+; CHECK-NEXT:     - 2
+; CHECK-NEXT:     - 0
+; CHECK:        .args:
+; CHECK-NEXT:     - .type_name:      float16
+; CHECK-NEXT:       .value_kind:     by_value
+; CHECK-NEXT:       .offset:         0
+; CHECK-NEXT:       .size:           64
+; CHECK-NEXT:       .value_type:     f32
+; CHECK-NEXT:       .name:           a
+; CHECK-NEXT:     - .value_kind:     hidden_global_offset_x
+; CHECK-NEXT:       .offset:         64
+; CHECK-NEXT:       .size:           8
+; CHECK-NEXT:       .value_type:     i64
+; CHECK-NEXT:     - .value_kind:     hidden_global_offset_y
+; CHECK-NEXT:       .offset:         72
+; CHECK-NEXT:       .size:           8
+; CHECK-NEXT:       .value_type:     i64
+; CHECK-NEXT:     - .value_kind:     hidden_global_offset_z
+; CHECK-NEXT:       .offset:         80
+; CHECK-NEXT:       .size:           8
+; CHECK-NEXT:       .value_type:     i64
+; CHECK-NEXT:     - .value_kind:     hidden_printf_buffer
+; CHECK-NEXT:       .offset:         88
+; CHECK-NEXT:       .size:           8
+; CHECK-NEXT:       .value_type:     i8
+; CHECK-NEXT:       .address_space:  global
+define amdgpu_kernel void @test_float16(<16 x float> %a)
+    !kernel_arg_addr_space !1 !kernel_arg_access_qual !2 !kernel_arg_type !14
+    !kernel_arg_base_type !14 !kernel_arg_type_qual !4 {
+  ret void
+}
+
+; CHECK:        .symbol:          test_double16.kd
+; CHECK:        .name:            test_double16
+; CHECK:        .language:        OpenCL C
+; CHECK:        .language_version:
+; CHECK-NEXT:     - 2
+; CHECK-NEXT:     - 0
+; CHECK:        .args:
+; CHECK-NEXT:     - .type_name:      double16
+; CHECK-NEXT:       .value_kind:     by_value
+; CHECK-NEXT:       .offset:         0
+; CHECK-NEXT:       .size:           128
+; CHECK-NEXT:       .value_type:     f64
+; CHECK-NEXT:       .name:           a
+; CHECK-NEXT:     - .value_kind:     hidden_global_offset_x
+; CHECK-NEXT:       .offset:         128
+; CHECK-NEXT:       .size:           8
+; CHECK-NEXT:       .value_type:     i64
+; CHECK-NEXT:     - .value_kind:     hidden_global_offset_y
+; CHECK-NEXT:       .offset:         136
+; CHECK-NEXT:       .size:           8
+; CHECK-NEXT:       .value_type:     i64
+; CHECK-NEXT:     - .value_kind:     hidden_global_offset_z
+; CHECK-NEXT:       .offset:         144
+; CHECK-NEXT:       .size:           8
+; CHECK-NEXT:       .value_type:     i64
+; CHECK-NEXT:     - .value_kind:     hidden_printf_buffer
+; CHECK-NEXT:       .offset:         152
+; CHECK-NEXT:       .size:           8
+; CHECK-NEXT:       .value_type:     i8
+; CHECK-NEXT:       .address_space:  global
+define amdgpu_kernel void @test_double16(<16 x double> %a)
+    !kernel_arg_addr_space !1 !kernel_arg_access_qual !2 !kernel_arg_type !15
+    !kernel_arg_base_type !15 !kernel_arg_type_qual !4 {
+  ret void
+}
+
+; CHECK:        .symbol:          test_pointer.kd
+; CHECK:        .name:            test_pointer
+; CHECK:        .language:        OpenCL C
+; CHECK:        .language_version:
+; CHECK-NEXT:     - 2
+; CHECK-NEXT:     - 0
+; CHECK:        .args:
+; CHECK-NEXT:     - .type_name:      'int  addrspace(5)*'
+; CHECK-NEXT:       .value_kind:     global_buffer
+; CHECK-NEXT:       .name:           a
+; CHECK-NEXT:       .offset:         0
+; CHECK-NEXT:       .size:           8
+; CHECK-NEXT:       .value_type:     i32
+; CHECK-NEXT:       .address_space:  global
+; CHECK-NEXT:     - .value_kind:     hidden_global_offset_x
+; CHECK-NEXT:       .offset:         8
+; CHECK-NEXT:       .size:           8
+; CHECK-NEXT:       .value_type:     i64
+; CHECK-NEXT:     - .value_kind:     hidden_global_offset_y
+; CHECK-NEXT:       .offset:         16
+; CHECK-NEXT:       .size:           8
+; CHECK-NEXT:       .value_type:     i64
+; CHECK-NEXT:     - .value_kind:     hidden_global_offset_z
+; CHECK-NEXT:       .offset:         24
+; CHECK-NEXT:       .size:           8
+; CHECK-NEXT:       .value_type:     i64
+; CHECK-NEXT:     - .value_kind:     hidden_printf_buffer
+; CHECK-NEXT:       .offset:         32
+; CHECK-NEXT:       .size:           8
+; CHECK-NEXT:       .value_type:     i8
+; CHECK-NEXT:       .address_space:  global
+define amdgpu_kernel void @test_pointer(i32 addrspace(1)* %a)
+    !kernel_arg_addr_space !1 !kernel_arg_access_qual !2 !kernel_arg_type !16
+    !kernel_arg_base_type !16 !kernel_arg_type_qual !4 {
+  ret void
+}
+
+; CHECK:        .symbol:          test_image.kd
+; CHECK:        .name:            test_image
+; CHECK:        .language:        OpenCL C
+; CHECK:        .language_version:
+; CHECK-NEXT:     - 2
+; CHECK-NEXT:     - 0
+; CHECK:        .args:
+; CHECK-NEXT:     - .type_name:      image2d_t
+; CHECK-NEXT:       .value_kind:     image
+; CHECK-NEXT:       .name:           a
+; CHECK-NEXT:       .offset:         0
+; CHECK-NEXT:       .size:           8
+; CHECK-NEXT:       .value_type:     struct
+; CHECK-NEXT:       .address_space:  global
+; CHECK-NEXT:     - .value_kind:     hidden_global_offset_x
+; CHECK-NEXT:       .offset:         8
+; CHECK-NEXT:       .size:           8
+; CHECK-NEXT:       .value_type:     i64
+; CHECK-NEXT:     - .value_kind:     hidden_global_offset_y
+; CHECK-NEXT:       .offset:         16
+; CHECK-NEXT:       .size:           8
+; CHECK-NEXT:       .value_type:     i64
+; CHECK-NEXT:     - .value_kind:     hidden_global_offset_z
+; CHECK-NEXT:       .offset:         24
+; CHECK-NEXT:       .size:           8
+; CHECK-NEXT:       .value_type:     i64
+; CHECK-NEXT:     - .value_kind:     hidden_printf_buffer
+; CHECK-NEXT:       .offset:         32
+; CHECK-NEXT:       .size:           8
+; CHECK-NEXT:       .value_type:     i8
+; CHECK-NEXT:       .address_space:  global
+define amdgpu_kernel void @test_image(%opencl.image2d_t addrspace(1)* %a)
+    !kernel_arg_addr_space !1 !kernel_arg_access_qual !2 !kernel_arg_type !17
+    !kernel_arg_base_type !17 !kernel_arg_type_qual !4 {
+  ret void
+}
+
+; CHECK:        .symbol:          test_sampler.kd
+; CHECK:        .name:            test_sampler
+; CHECK:        .language:        OpenCL C
+; CHECK:        .language_version:
+; CHECK-NEXT:     - 2
+; CHECK-NEXT:     - 0
+; CHECK:        .args:
+; CHECK-NEXT:     - .type_name:      sampler_t
+; CHECK-NEXT:       .value_kind:     sampler
+; CHECK-NEXT:       .offset:         0
+; CHECK-NEXT:       .size:           4
+; CHECK-NEXT:       .value_type:     i32
+; CHECK-NEXT:       .name:           a
+; CHECK-NEXT:     - .value_kind:     hidden_global_offset_x
+; CHECK-NEXT:       .offset:         8
+; CHECK-NEXT:       .size:           8
+; CHECK-NEXT:       .value_type:     i64
+; CHECK-NEXT:     - .value_kind:     hidden_global_offset_y
+; CHECK-NEXT:       .offset:         16
+; CHECK-NEXT:       .size:           8
+; CHECK-NEXT:       .value_type:     i64
+; CHECK-NEXT:     - .value_kind:     hidden_global_offset_z
+; CHECK-NEXT:       .offset:         24
+; CHECK-NEXT:       .size:           8
+; CHECK-NEXT:       .value_type:     i64
+; CHECK-NEXT:     - .value_kind:     hidden_printf_buffer
+; CHECK-NEXT:       .offset:         32
+; CHECK-NEXT:       .size:           8
+; CHECK-NEXT:       .value_type:     i8
+; CHECK-NEXT:       .address_space:  global
+define amdgpu_kernel void @test_sampler(i32 %a)
+    !kernel_arg_addr_space !1 !kernel_arg_access_qual !2 !kernel_arg_type !18
+    !kernel_arg_base_type !18 !kernel_arg_type_qual !4 {
+  ret void
+}
+
+; CHECK:        .symbol:          test_queue.kd
+; CHECK:        .name:            test_queue
+; CHECK:        .language:        OpenCL C
+; CHECK:        .language_version:
+; CHECK-NEXT:     - 2
+; CHECK-NEXT:     - 0
+; CHECK:        .args:
+; CHECK-NEXT:     - .type_name:      queue_t
+; CHECK-NEXT:       .value_kind:     queue
+; CHECK-NEXT:       .name:           a
+; CHECK-NEXT:       .offset:         0
+; CHECK-NEXT:       .size:           8
+; CHECK-NEXT:       .value_type:     struct
+; CHECK-NEXT:       .address_space:  global
+; CHECK-NEXT:     - .value_kind:     hidden_global_offset_x
+; CHECK-NEXT:       .offset:         8
+; CHECK-NEXT:       .size:           8
+; CHECK-NEXT:       .value_type:     i64
+; CHECK-NEXT:     - .value_kind:     hidden_global_offset_y
+; CHECK-NEXT:       .offset:         16
+; CHECK-NEXT:       .size:           8
+; CHECK-NEXT:       .value_type:     i64
+; CHECK-NEXT:     - .value_kind:     hidden_global_offset_z
+; CHECK-NEXT:       .offset:         24
+; CHECK-NEXT:       .size:           8
+; CHECK-NEXT:       .value_type:     i64
+; CHECK-NEXT:     - .value_kind:     hidden_printf_buffer
+; CHECK-NEXT:       .offset:         32
+; CHECK-NEXT:       .size:           8
+; CHECK-NEXT:       .value_type:     i8
+; CHECK-NEXT:       .address_space:  global
+define amdgpu_kernel void @test_queue(%opencl.queue_t addrspace(1)* %a)
+    !kernel_arg_addr_space !1 !kernel_arg_access_qual !2 !kernel_arg_type !19
+    !kernel_arg_base_type !19 !kernel_arg_type_qual !4 {
+  ret void
+}
+
+; CHECK:        .symbol:          test_struct.kd
+; CHECK:        .name:            test_struct
+; CHECK:        .language:        OpenCL C
+; CHECK:        .language_version:
+; CHECK-NEXT:     - 2
+; CHECK-NEXT:     - 0
+; CHECK:        .args:
+; CHECK-NEXT:     - .type_name:      struct A
+; CHECK-NEXT:       .value_kind:     global_buffer
+; CHECK-NEXT:       .name:           a
+; CHECK-NEXT:       .offset:         0
+; CHECK-NEXT:       .size:           4
+; CHECK-NEXT:       .value_type:     struct
+; CHECK-NEXT:       .address_space: private
+; CHECK-NEXT:     - .value_kind:     hidden_global_offset_x
+; CHECK-NEXT:       .offset:         8
+; CHECK-NEXT:       .size:           8
+; CHECK-NEXT:       .value_type:     i64
+; CHECK-NEXT:     - .value_kind:     hidden_global_offset_y
+; CHECK-NEXT:       .offset:         16
+; CHECK-NEXT:       .size:           8
+; CHECK-NEXT:       .value_type:     i64
+; CHECK-NEXT:     - .value_kind:     hidden_global_offset_z
+; CHECK-NEXT:       .offset:         24
+; CHECK-NEXT:       .size:           8
+; CHECK-NEXT:       .value_type:     i64
+; CHECK-NEXT:     - .value_kind:     hidden_printf_buffer
+; CHECK-NEXT:       .offset:         32
+; CHECK-NEXT:       .size:           8
+; CHECK-NEXT:       .value_type:     i8
+; CHECK-NEXT:       .address_space:  global
+define amdgpu_kernel void @test_struct(%struct.A addrspace(5)* byval %a)
+    !kernel_arg_addr_space !1 !kernel_arg_access_qual !2 !kernel_arg_type !20
+    !kernel_arg_base_type !20 !kernel_arg_type_qual !4 {
+  ret void
+}
+
+; CHECK:        .symbol:          test_i128.kd
+; CHECK:        .name:            test_i128
+; CHECK:        .language:        OpenCL C
+; CHECK:        .language_version:
+; CHECK-NEXT:     - 2
+; CHECK-NEXT:     - 0
+; CHECK:        .args:
+; CHECK-NEXT:     - .type_name:      i128
+; CHECK-NEXT:       .value_kind:     by_value
+; CHECK-NEXT:       .offset:         0
+; CHECK-NEXT:       .size:           16
+; CHECK-NEXT:       .value_type:     struct
+; CHECK-NEXT:       .name:           a
+; CHECK-NEXT:     - .value_kind:     hidden_global_offset_x
+; CHECK-NEXT:       .offset:         16
+; CHECK-NEXT:       .size:           8
+; CHECK-NEXT:       .value_type:     i64
+; CHECK-NEXT:     - .value_kind:     hidden_global_offset_y
+; CHECK-NEXT:       .offset:         24
+; CHECK-NEXT:       .size:           8
+; CHECK-NEXT:       .value_type:     i64
+; CHECK-NEXT:     - .value_kind:     hidden_global_offset_z
+; CHECK-NEXT:       .offset:         32
+; CHECK-NEXT:       .size:           8
+; CHECK-NEXT:       .value_type:     i64
+; CHECK-NEXT:     - .value_kind:     hidden_printf_buffer
+; CHECK-NEXT:       .offset:         40
+; CHECK-NEXT:       .size:           8
+; CHECK-NEXT:       .value_type:     i8
+; CHECK-NEXT:       .address_space:  global
+define amdgpu_kernel void @test_i128(i128 %a)
+    !kernel_arg_addr_space !1 !kernel_arg_access_qual !2 !kernel_arg_type !21
+    !kernel_arg_base_type !21 !kernel_arg_type_qual !4 {
+  ret void
+}
+
+; CHECK:        .symbol:          test_multi_arg.kd
+; CHECK:        .name:            test_multi_arg
+; CHECK:        .language:        OpenCL C
+; CHECK:        .language_version:
+; CHECK-NEXT:     - 2
+; CHECK-NEXT:     - 0
+; CHECK:        .args:
+; CHECK-NEXT:     - .type_name:      int
+; CHECK-NEXT:       .value_kind:     by_value
+; CHECK-NEXT:       .offset:         0
+; CHECK-NEXT:       .size:           4
+; CHECK-NEXT:       .value_type:     i32
+; CHECK-NEXT:       .name:           a
+; CHECK-NEXT:     - .type_name:      short2
+; CHECK-NEXT:       .value_kind:     by_value
+; CHECK-NEXT:       .offset:         4
+; CHECK-NEXT:       .size:           4
+; CHECK-NEXT:       .value_type:     i16
+; CHECK-NEXT:       .name:           b
+; CHECK-NEXT:     - .type_name:      char3
+; CHECK-NEXT:       .value_kind:     by_value
+; CHECK-NEXT:       .offset:         8
+; CHECK-NEXT:       .size:           4
+; CHECK-NEXT:       .value_type:     i8
+; CHECK-NEXT:       .name:           c
+; CHECK-NEXT:     - .value_kind:     hidden_global_offset_x
+; CHECK-NEXT:       .offset:         16
+; CHECK-NEXT:       .size:           8
+; CHECK-NEXT:       .value_type:     i64
+; CHECK-NEXT:     - .value_kind:     hidden_global_offset_y
+; CHECK-NEXT:       .offset:         24
+; CHECK-NEXT:       .size:           8
+; CHECK-NEXT:       .value_type:     i64
+; CHECK-NEXT:     - .value_kind:     hidden_global_offset_z
+; CHECK-NEXT:       .offset:         32
+; CHECK-NEXT:       .size:           8
+; CHECK-NEXT:       .value_type:     i64
+; CHECK-NEXT:     - .value_kind:     hidden_printf_buffer
+; CHECK-NEXT:       .offset:         40
+; CHECK-NEXT:       .size:           8
+; CHECK-NEXT:       .value_type:     i8
+; CHECK-NEXT:       .address_space:  global
+define amdgpu_kernel void @test_multi_arg(i32 %a, <2 x i16> %b, <3 x i8> %c)
+    !kernel_arg_addr_space !22 !kernel_arg_access_qual !23 !kernel_arg_type !24
+    !kernel_arg_base_type !24 !kernel_arg_type_qual !25 {
+  ret void
+}
+
+; CHECK:        .symbol:          test_addr_space.kd
+; CHECK:        .name:            test_addr_space
+; CHECK:        .language:        OpenCL C
+; CHECK:        .language_version:
+; CHECK-NEXT:     - 2
+; CHECK-NEXT:     - 0
+; CHECK:        .args:
+; CHECK-NEXT:     - .type_name:      'int  addrspace(5)*'
+; CHECK-NEXT:       .value_kind:     global_buffer
+; CHECK-NEXT:       .name:           g
+; CHECK-NEXT:       .offset:         0
+; CHECK-NEXT:       .size:           8
+; CHECK-NEXT:       .value_type:     i32
+; CHECK-NEXT:       .address_space:  global
+; CHECK-NEXT:     - .type_name:      'int  addrspace(5)*'
+; CHECK-NEXT:       .value_kind:     global_buffer
+; CHECK-NEXT:       .name:           c
+; CHECK-NEXT:       .offset:         8
+; CHECK-NEXT:       .size:           8
+; CHECK-NEXT:       .value_type:     i32
+; CHECK-NEXT:       .address_space: constant
+; CHECK-NEXT:     - .type_name:      'int  addrspace(5)*'
+; CHECK-NEXT:       .value_kind:     dynamic_shared_pointer
+; CHECK-NEXT:       .name:           l
+; CHECK-NEXT:       .offset:         16
+; CHECK-NEXT:       .size:           4
+; CHECK-NEXT:       .value_type:     i32
+; CHECK-NEXT:       .pointee_align:  4
+; CHECK-NEXT:       .address_space: local
+; CHECK-NEXT:     - .value_kind:     hidden_global_offset_x
+; CHECK-NEXT:       .offset:         24
+; CHECK-NEXT:       .size:           8
+; CHECK-NEXT:       .value_type:     i64
+; CHECK-NEXT:     - .value_kind:     hidden_global_offset_y
+; CHECK-NEXT:       .offset:         32
+; CHECK-NEXT:       .size:           8
+; CHECK-NEXT:       .value_type:     i64
+; CHECK-NEXT:     - .value_kind:     hidden_global_offset_z
+; CHECK-NEXT:       .offset:         40
+; CHECK-NEXT:       .size:           8
+; CHECK-NEXT:       .value_type:     i64
+; CHECK-NEXT:     - .value_kind:     hidden_printf_buffer
+; CHECK-NEXT:       .offset:         48
+; CHECK-NEXT:       .size:           8
+; CHECK-NEXT:       .value_type:     i8
+; CHECK-NEXT:       .address_space:  global
+define amdgpu_kernel void @test_addr_space(i32 addrspace(1)* %g,
+                                           i32 addrspace(4)* %c,
+                                           i32 addrspace(3)* %l)
+    !kernel_arg_addr_space !50 !kernel_arg_access_qual !23 !kernel_arg_type !51
+    !kernel_arg_base_type !51 !kernel_arg_type_qual !25 {
+  ret void
+}
+
+; CHECK:        .symbol:          test_type_qual.kd
+; CHECK:        .name:            test_type_qual
+; CHECK:        .language:        OpenCL C
+; CHECK:        .language_version:
+; CHECK-NEXT:     - 2
+; CHECK-NEXT:     - 0
+; CHECK:        .args:
+; CHECK-NEXT:     - .type_name:      'int  addrspace(5)*'
+; CHECK-NEXT:       .value_kind:     global_buffer
+; CHECK-NEXT:       .name:           a
+; CHECK-NEXT:       .offset:         0
+; CHECK-NEXT:       .size:           8
+; CHECK-NEXT:       .is_volatile:    true
+; CHECK-NEXT:       .value_type:     i32
+; CHECK-NEXT:       .address_space:  global
+; CHECK-NEXT:     - .type_name:      'int  addrspace(5)*'
+; CHECK-NEXT:       .value_kind:     global_buffer
+; CHECK-NEXT:       .name:           b
+; CHECK-NEXT:       .is_const:       true
+; CHECK-NEXT:       .offset:         8
+; CHECK-NEXT:       .size:           8
+; CHECK-NEXT:       .is_restrict:    true
+; CHECK-NEXT:       .value_type:     i32
+; CHECK-NEXT:       .address_space:  global
+; CHECK-NEXT:     - .type_name:      'int  addrspace(5)*'
+; CHECK-NEXT:       .value_kind:     pipe
+; CHECK-NEXT:       .name:           c
+; CHECK-NEXT:       .offset:         16
+; CHECK-NEXT:       .is_pipe:        true
+; CHECK-NEXT:       .size:           8
+; CHECK-NEXT:       .value_type:     struct
+; CHECK-NEXT:       .address_space:  global
+; CHECK-NEXT:     - .value_kind:     hidden_global_offset_x
+; CHECK-NEXT:       .offset:         24
+; CHECK-NEXT:       .size:           8
+; CHECK-NEXT:       .value_type:     i64
+; CHECK-NEXT:     - .value_kind:     hidden_global_offset_y
+; CHECK-NEXT:       .offset:         32
+; CHECK-NEXT:       .size:           8
+; CHECK-NEXT:       .value_type:     i64
+; CHECK-NEXT:     - .value_kind:     hidden_global_offset_z
+; CHECK-NEXT:       .offset:         40
+; CHECK-NEXT:       .size:           8
+; CHECK-NEXT:       .value_type:     i64
+; CHECK-NEXT:     - .value_kind:     hidden_printf_buffer
+; CHECK-NEXT:       .offset:         48
+; CHECK-NEXT:       .size:           8
+; CHECK-NEXT:       .value_type:     i8
+; CHECK-NEXT:       .address_space:  global
+define amdgpu_kernel void @test_type_qual(i32 addrspace(1)* %a,
+                                          i32 addrspace(1)* %b,
+                                          %opencl.pipe_t addrspace(1)* %c)
+    !kernel_arg_addr_space !22 !kernel_arg_access_qual !23 !kernel_arg_type !51
+    !kernel_arg_base_type !51 !kernel_arg_type_qual !70 {
+  ret void
+}
+
+; CHECK:        .symbol:          test_access_qual.kd
+; CHECK:        .name:            test_access_qual
+; CHECK:        .language:        OpenCL C
+; CHECK:        .language_version:
+; CHECK-NEXT:     - 2
+; CHECK-NEXT:     - 0
+; CHECK:        .args:
+; CHECK-NEXT:     - .type_name:      image1d_t
+; CHECK-NEXT:       .value_kind:     image
+; CHECK-NEXT:       .name:           ro
+; CHECK-NEXT:       .access:         read_only
+; CHECK-NEXT:       .offset:         0
+; CHECK-NEXT:       .size:           8
+; CHECK-NEXT:       .value_type:     struct
+; CHECK-NEXT:       .address_space:  global
+; CHECK-NEXT:     - .type_name:      image2d_t
+; CHECK-NEXT:       .value_kind:     image
+; CHECK-NEXT:       .name:           wo
+; CHECK-NEXT:       .access:         write_only
+; CHECK-NEXT:       .offset:         8
+; CHECK-NEXT:       .size:           8
+; CHECK-NEXT:       .value_type:     struct
+; CHECK-NEXT:       .address_space:  global
+; CHECK-NEXT:     - .type_name:      image3d_t
+; CHECK-NEXT:       .value_kind:     image
+; CHECK-NEXT:       .name:           rw
+; CHECK-NEXT:       .access:         read_write
+; CHECK-NEXT:       .offset:         16
+; CHECK-NEXT:       .size:           8
+; CHECK-NEXT:       .value_type:     struct
+; CHECK-NEXT:       .address_space:  global
+; CHECK-NEXT:     - .value_kind:     hidden_global_offset_x
+; CHECK-NEXT:       .offset:         24
+; CHECK-NEXT:       .size:           8
+; CHECK-NEXT:       .value_type:     i64
+; CHECK-NEXT:     - .value_kind:     hidden_global_offset_y
+; CHECK-NEXT:       .offset:         32
+; CHECK-NEXT:       .size:           8
+; CHECK-NEXT:       .value_type:     i64
+; CHECK-NEXT:     - .value_kind:     hidden_global_offset_z
+; CHECK-NEXT:       .offset:         40
+; CHECK-NEXT:       .size:           8
+; CHECK-NEXT:       .value_type:     i64
+; CHECK-NEXT:     - .value_kind:     hidden_printf_buffer
+; CHECK-NEXT:       .offset:         48
+; CHECK-NEXT:       .size:           8
+; CHECK-NEXT:       .value_type:     i8
+; CHECK-NEXT:       .address_space:  global
+define amdgpu_kernel void @test_access_qual(%opencl.image1d_t addrspace(1)* %ro,
+                                            %opencl.image2d_t addrspace(1)* %wo,
+                                            %opencl.image3d_t addrspace(1)* %rw)
+    !kernel_arg_addr_space !60 !kernel_arg_access_qual !61 !kernel_arg_type !62
+    !kernel_arg_base_type !62 !kernel_arg_type_qual !25 {
+  ret void
+}
+
+; CHECK:        .symbol:          test_vec_type_hint_half.kd
+; CHECK:        .name:            test_vec_type_hint_half
+; CHECK:        .language:        OpenCL C
+; CHECK:        .language_version:
+; CHECK-NEXT:     - 2
+; CHECK-NEXT:     - 0
+; CHECK:        .args:
+; CHECK-NEXT:     - .type_name:      int
+; CHECK-NEXT:       .value_kind:     by_value
+; CHECK-NEXT:       .offset:         0
+; CHECK-NEXT:       .size:           4
+; CHECK-NEXT:       .value_type:     i32
+; CHECK-NEXT:       .name:           a
+; CHECK-NEXT:     - .value_kind:     hidden_global_offset_x
+; CHECK-NEXT:       .offset:         8
+; CHECK-NEXT:       .size:           8
+; CHECK-NEXT:       .value_type:     i64
+; CHECK-NEXT:     - .value_kind:     hidden_global_offset_y
+; CHECK-NEXT:       .offset:         16
+; CHECK-NEXT:       .size:           8
+; CHECK-NEXT:       .value_type:     i64
+; CHECK-NEXT:     - .value_kind:     hidden_global_offset_z
+; CHECK-NEXT:       .offset:         24
+; CHECK-NEXT:       .size:           8
+; CHECK-NEXT:       .value_type:     i64
+; CHECK-NEXT:     - .value_kind:     hidden_printf_buffer
+; CHECK-NEXT:       .offset:         32
+; CHECK-NEXT:       .size:           8
+; CHECK-NEXT:       .value_type:     i8
+; CHECK-NEXT:       .address_space:  global
+; CHECK:        .vec_type_hint:   half
+define amdgpu_kernel void @test_vec_type_hint_half(i32 %a)
+    !kernel_arg_addr_space !1 !kernel_arg_access_qual !2 !kernel_arg_type !3
+    !kernel_arg_base_type !3 !kernel_arg_type_qual !4 !vec_type_hint !26 {
+  ret void
+}
+
+; CHECK:        .symbol:          test_vec_type_hint_float.kd
+; CHECK:        .name:            test_vec_type_hint_float
+; CHECK:        .language:        OpenCL C
+; CHECK:        .language_version:
+; CHECK-NEXT:     - 2
+; CHECK-NEXT:     - 0
+; CHECK:        .args:
+; CHECK-NEXT:     - .type_name:      int
+; CHECK-NEXT:       .value_kind:     by_value
+; CHECK-NEXT:       .offset:         0
+; CHECK-NEXT:       .size:           4
+; CHECK-NEXT:       .value_type:     i32
+; CHECK-NEXT:       .name:           a
+; CHECK-NEXT:     - .value_kind:     hidden_global_offset_x
+; CHECK-NEXT:       .offset:         8
+; CHECK-NEXT:       .size:           8
+; CHECK-NEXT:       .value_type:     i64
+; CHECK-NEXT:     - .value_kind:     hidden_global_offset_y
+; CHECK-NEXT:       .offset:         16
+; CHECK-NEXT:       .size:           8
+; CHECK-NEXT:       .value_type:     i64
+; CHECK-NEXT:     - .value_kind:     hidden_global_offset_z
+; CHECK-NEXT:       .offset:         24
+; CHECK-NEXT:       .size:           8
+; CHECK-NEXT:       .value_type:     i64
+; CHECK-NEXT:     - .value_kind:     hidden_printf_buffer
+; CHECK-NEXT:       .offset:         32
+; CHECK-NEXT:       .size:           8
+; CHECK-NEXT:       .value_type:     i8
+; CHECK-NEXT:       .address_space:  global
+; CHECK:        .vec_type_hint:   float
+define amdgpu_kernel void @test_vec_type_hint_float(i32 %a)
+    !kernel_arg_addr_space !1 !kernel_arg_access_qual !2 !kernel_arg_type !3
+    !kernel_arg_base_type !3 !kernel_arg_type_qual !4 !vec_type_hint !27 {
+  ret void
+}
+
+; CHECK:        .symbol:          test_vec_type_hint_double.kd
+; CHECK:        .name:            test_vec_type_hint_double
+; CHECK:        .language:        OpenCL C
+; CHECK:        .language_version:
+; CHECK-NEXT:     - 2
+; CHECK-NEXT:     - 0
+; CHECK:        .args:
+; CHECK-NEXT:     - .type_name:      int
+; CHECK-NEXT:       .value_kind:     by_value
+; CHECK-NEXT:       .offset:         0
+; CHECK-NEXT:       .size:           4
+; CHECK-NEXT:       .value_type:     i32
+; CHECK-NEXT:       .name:           a
+; CHECK-NEXT:     - .value_kind:     hidden_global_offset_x
+; CHECK-NEXT:       .offset:         8
+; CHECK-NEXT:       .size:           8
+; CHECK-NEXT:       .value_type:     i64
+; CHECK-NEXT:     - .value_kind:     hidden_global_offset_y
+; CHECK-NEXT:       .offset:         16
+; CHECK-NEXT:       .size:           8
+; CHECK-NEXT:       .value_type:     i64
+; CHECK-NEXT:     - .value_kind:     hidden_global_offset_z
+; CHECK-NEXT:       .offset:         24
+; CHECK-NEXT:       .size:           8
+; CHECK-NEXT:       .value_type:     i64
+; CHECK-NEXT:     - .value_kind:     hidden_printf_buffer
+; CHECK-NEXT:       .offset:         32
+; CHECK-NEXT:       .size:           8
+; CHECK-NEXT:       .value_type:     i8
+; CHECK-NEXT:       .address_space:  global
+; CHECK:        .vec_type_hint:   double
+define amdgpu_kernel void @test_vec_type_hint_double(i32 %a)
+    !kernel_arg_addr_space !1 !kernel_arg_access_qual !2 !kernel_arg_type !3
+    !kernel_arg_base_type !3 !kernel_arg_type_qual !4 !vec_type_hint !28 {
+  ret void
+}
+
+; CHECK:        .symbol:          test_vec_type_hint_char.kd
+; CHECK:        .name:            test_vec_type_hint_char
+; CHECK:        .language:        OpenCL C
+; CHECK:        .language_version:
+; CHECK-NEXT:     - 2
+; CHECK-NEXT:     - 0
+; CHECK:        .args:
+; CHECK-NEXT:     - .type_name:      int
+; CHECK-NEXT:       .value_kind:     by_value
+; CHECK-NEXT:       .offset:         0
+; CHECK-NEXT:       .size:           4
+; CHECK-NEXT:       .value_type:     i32
+; CHECK-NEXT:       .name:           a
+; CHECK-NEXT:     - .value_kind:     hidden_global_offset_x
+; CHECK-NEXT:       .offset:         8
+; CHECK-NEXT:       .size:           8
+; CHECK-NEXT:       .value_type:     i64
+; CHECK-NEXT:     - .value_kind:     hidden_global_offset_y
+; CHECK-NEXT:       .offset:         16
+; CHECK-NEXT:       .size:           8
+; CHECK-NEXT:       .value_type:     i64
+; CHECK-NEXT:     - .value_kind:     hidden_global_offset_z
+; CHECK-NEXT:       .offset:         24
+; CHECK-NEXT:       .size:           8
+; CHECK-NEXT:       .value_type:     i64
+; CHECK-NEXT:     - .value_kind:     hidden_printf_buffer
+; CHECK-NEXT:       .offset:         32
+; CHECK-NEXT:       .size:           8
+; CHECK-NEXT:       .value_type:     i8
+; CHECK-NEXT:       .address_space:  global
+; CHECK:        .vec_type_hint:   char
+define amdgpu_kernel void @test_vec_type_hint_char(i32 %a)
+    !kernel_arg_addr_space !1 !kernel_arg_access_qual !2 !kernel_arg_type !3
+    !kernel_arg_base_type !3 !kernel_arg_type_qual !4 !vec_type_hint !29 {
+  ret void
+}
+
+; CHECK:        .symbol:          test_vec_type_hint_short.kd
+; CHECK:        .name:            test_vec_type_hint_short
+; CHECK:        .language:        OpenCL C
+; CHECK:        .language_version:
+; CHECK-NEXT:     - 2
+; CHECK-NEXT:     - 0
+; CHECK:        .args:
+; CHECK-NEXT:     - .type_name:      int
+; CHECK-NEXT:       .value_kind:     by_value
+; CHECK-NEXT:       .offset:         0
+; CHECK-NEXT:       .size:           4
+; CHECK-NEXT:       .value_type:     i32
+; CHECK-NEXT:       .name:           a
+; CHECK-NEXT:     - .value_kind:     hidden_global_offset_x
+; CHECK-NEXT:       .offset:         8
+; CHECK-NEXT:       .size:           8
+; CHECK-NEXT:       .value_type:     i64
+; CHECK-NEXT:     - .value_kind:     hidden_global_offset_y
+; CHECK-NEXT:       .offset:         16
+; CHECK-NEXT:       .size:           8
+; CHECK-NEXT:       .value_type:     i64
+; CHECK-NEXT:     - .value_kind:     hidden_global_offset_z
+; CHECK-NEXT:       .offset:         24
+; CHECK-NEXT:       .size:           8
+; CHECK-NEXT:       .value_type:     i64
+; CHECK-NEXT:     - .value_kind:     hidden_printf_buffer
+; CHECK-NEXT:       .offset:         32
+; CHECK-NEXT:       .size:           8
+; CHECK-NEXT:       .value_type:     i8
+; CHECK-NEXT:       .address_space:  global
+; CHECK:        .vec_type_hint:   short
+define amdgpu_kernel void @test_vec_type_hint_short(i32 %a)
+    !kernel_arg_addr_space !1 !kernel_arg_access_qual !2 !kernel_arg_type !3
+    !kernel_arg_base_type !3 !kernel_arg_type_qual !4 !vec_type_hint !30 {
+  ret void
+}
+
+; CHECK:        .symbol:          test_vec_type_hint_long.kd
+; CHECK:        .name:            test_vec_type_hint_long
+; CHECK:        .language:        OpenCL C
+; CHECK:        .language_version:
+; CHECK-NEXT:     - 2
+; CHECK-NEXT:     - 0
+; CHECK:        .args:
+; CHECK-NEXT:     - .type_name:      int
+; CHECK-NEXT:       .value_kind:     by_value
+; CHECK-NEXT:       .offset:         0
+; CHECK-NEXT:       .size:           4
+; CHECK-NEXT:       .value_type:     i32
+; CHECK-NEXT:       .name:           a
+; CHECK-NEXT:     - .value_kind:     hidden_global_offset_x
+; CHECK-NEXT:       .offset:         8
+; CHECK-NEXT:       .size:           8
+; CHECK-NEXT:       .value_type:     i64
+; CHECK-NEXT:     - .value_kind:     hidden_global_offset_y
+; CHECK-NEXT:       .offset:         16
+; CHECK-NEXT:       .size:           8
+; CHECK-NEXT:       .value_type:     i64
+; CHECK-NEXT:     - .value_kind:     hidden_global_offset_z
+; CHECK-NEXT:       .offset:         24
+; CHECK-NEXT:       .size:           8
+; CHECK-NEXT:       .value_type:     i64
+; CHECK-NEXT:     - .value_kind:     hidden_printf_buffer
+; CHECK-NEXT:       .offset:         32
+; CHECK-NEXT:       .size:           8
+; CHECK-NEXT:       .value_type:     i8
+; CHECK-NEXT:       .address_space:  global
+; CHECK:        .vec_type_hint:   long
+define amdgpu_kernel void @test_vec_type_hint_long(i32 %a)
+    !kernel_arg_addr_space !1 !kernel_arg_access_qual !2 !kernel_arg_type !3
+    !kernel_arg_base_type !3 !kernel_arg_type_qual !4 !vec_type_hint !31 {
+  ret void
+}
+
+; CHECK:        .symbol:          test_vec_type_hint_unknown.kd
+; CHECK:        .name:            test_vec_type_hint_unknown
+; CHECK:        .language:        OpenCL C
+; CHECK:        .language_version:
+; CHECK-NEXT:     - 2
+; CHECK-NEXT:     - 0
+; CHECK:        .args:
+; CHECK-NEXT:     - .type_name:      int
+; CHECK-NEXT:       .value_kind:     by_value
+; CHECK-NEXT:       .offset:         0
+; CHECK-NEXT:       .size:           4
+; CHECK-NEXT:       .value_type:     i32
+; CHECK-NEXT:       .name:           a
+; CHECK-NEXT:     - .value_kind:     hidden_global_offset_x
+; CHECK-NEXT:       .offset:         8
+; CHECK-NEXT:       .size:           8
+; CHECK-NEXT:       .value_type:     i64
+; CHECK-NEXT:     - .value_kind:     hidden_global_offset_y
+; CHECK-NEXT:       .offset:         16
+; CHECK-NEXT:       .size:           8
+; CHECK-NEXT:       .value_type:     i64
+; CHECK-NEXT:     - .value_kind:     hidden_global_offset_z
+; CHECK-NEXT:       .offset:         24
+; CHECK-NEXT:       .size:           8
+; CHECK-NEXT:       .value_type:     i64
+; CHECK-NEXT:     - .value_kind:     hidden_printf_buffer
+; CHECK-NEXT:       .offset:         32
+; CHECK-NEXT:       .size:           8
+; CHECK-NEXT:       .value_type:     i8
+; CHECK-NEXT:       .address_space:  global
+; CHECK:        .vec_type_hint:   unknown
+define amdgpu_kernel void @test_vec_type_hint_unknown(i32 %a)
+    !kernel_arg_addr_space !1 !kernel_arg_access_qual !2 !kernel_arg_type !3
+    !kernel_arg_base_type !3 !kernel_arg_type_qual !4 !vec_type_hint !32 {
+  ret void
+}
+
+; CHECK:        .reqd_workgroup_size:
+; CHECK-NEXT:     - 1
+; CHECK-NEXT:     - 2
+; CHECK-NEXT:     - 4
+; CHECK:        .symbol:          test_reqd_wgs_vec_type_hint.kd
+; CHECK:        .name:            test_reqd_wgs_vec_type_hint
+; CHECK:        .language:        OpenCL C
+; CHECK:        .language_version:
+; CHECK-NEXT:     - 2
+; CHECK-NEXT:     - 0
+; CHECK:        .args:
+; CHECK-NEXT:     - .type_name:          int
+; CHECK-NEXT:       .value_kind:         by_value
+; CHECK-NEXT:       .offset:             0
+; CHECK-NEXT:       .size:               4
+; CHECK-NEXT:       .value_type:         i32
+; CHECK-NEXT:       .name:               a
+; CHECK-NEXT:     - .value_kind:         hidden_global_offset_x
+; CHECK-NEXT:       .offset:             8
+; CHECK-NEXT:       .size:               8
+; CHECK-NEXT:       .value_type:         i64
+; CHECK-NEXT:     - .value_kind:         hidden_global_offset_y
+; CHECK-NEXT:       .offset:             16
+; CHECK-NEXT:       .size:               8
+; CHECK-NEXT:       .value_type:         i64
+; CHECK-NEXT:     - .value_kind:         hidden_global_offset_z
+; CHECK-NEXT:       .offset:             24
+; CHECK-NEXT:       .size:               8
+; CHECK-NEXT:       .value_type:         i64
+; CHECK-NEXT:     - .value_kind:         hidden_printf_buffer
+; CHECK-NEXT:       .offset:             32
+; CHECK-NEXT:       .size:               8
+; CHECK-NEXT:       .value_type:         i8
+; CHECK-NEXT:       .address_space:      global
+; CHECK:        .vec_type_hint:       int
+define amdgpu_kernel void @test_reqd_wgs_vec_type_hint(i32 %a)
+    !kernel_arg_addr_space !1 !kernel_arg_access_qual !2 !kernel_arg_type !3
+    !kernel_arg_base_type !3 !kernel_arg_type_qual !4 !vec_type_hint !5
+    !reqd_work_group_size !6 {
+  ret void
+}
+
+; CHECK:        .symbol:          test_wgs_hint_vec_type_hint.kd
+; CHECK:        .workgroup_size_hint:
+; CHECK-NEXT:     - 8
+; CHECK-NEXT:     - 16
+; CHECK-NEXT:     - 32
+; CHECK:        .name:            test_wgs_hint_vec_type_hint
+; CHECK:        .language:        OpenCL C
+; CHECK:        .language_version:
+; CHECK-NEXT:     - 2
+; CHECK-NEXT:     - 0
+; CHECK:        .args:
+; CHECK-NEXT:     - .type_name:          int
+; CHECK-NEXT:       .value_kind:         by_value
+; CHECK-NEXT:       .offset:             0
+; CHECK-NEXT:       .size:               4
+; CHECK-NEXT:       .value_type:         i32
+; CHECK-NEXT:       .name:               a
+; CHECK-NEXT:     - .value_kind:         hidden_global_offset_x
+; CHECK-NEXT:       .offset:             8
+; CHECK-NEXT:       .size:               8
+; CHECK-NEXT:       .value_type:         i64
+; CHECK-NEXT:     - .value_kind:         hidden_global_offset_y
+; CHECK-NEXT:       .offset:             16
+; CHECK-NEXT:       .size:               8
+; CHECK-NEXT:       .value_type:         i64
+; CHECK-NEXT:     - .value_kind:         hidden_global_offset_z
+; CHECK-NEXT:       .offset:             24
+; CHECK-NEXT:       .size:               8
+; CHECK-NEXT:       .value_type:         i64
+; CHECK-NEXT:     - .value_kind:         hidden_printf_buffer
+; CHECK-NEXT:       .offset:             32
+; CHECK-NEXT:       .size:               8
+; CHECK-NEXT:       .value_type:         i8
+; CHECK-NEXT:       .address_space:      global
+; CHECK:        .vec_type_hint:       uint4
+define amdgpu_kernel void @test_wgs_hint_vec_type_hint(i32 %a)
+    !kernel_arg_addr_space !1 !kernel_arg_access_qual !2 !kernel_arg_type !3
+    !kernel_arg_base_type !3 !kernel_arg_type_qual !4 !vec_type_hint !7
+    !work_group_size_hint !8 {
+  ret void
+}
+
+; CHECK:        .symbol:          test_arg_ptr_to_ptr.kd
+; CHECK:        .name:            test_arg_ptr_to_ptr
+; CHECK:        .language:        OpenCL C
+; CHECK:        .language_version:
+; CHECK-NEXT:     - 2
+; CHECK-NEXT:     - 0
+; CHECK:        .args:
+; CHECK-NEXT:     - .type_name:      'int  addrspace(5)* addrspace(5)*'
+; CHECK-NEXT:       .value_kind:     global_buffer
+; CHECK-NEXT:       .name:           a
+; CHECK-NEXT:       .offset:         0
+; CHECK-NEXT:       .size:           8
+; CHECK-NEXT:       .value_type:     i32
+; CHECK-NEXT:       .address_space:  global
+; CHECK-NEXT:     - .value_kind:     hidden_global_offset_x
+; CHECK-NEXT:       .offset:         8
+; CHECK-NEXT:       .size:           8
+; CHECK-NEXT:       .value_type:     i64
+; CHECK-NEXT:     - .value_kind:     hidden_global_offset_y
+; CHECK-NEXT:       .offset:         16
+; CHECK-NEXT:       .size:           8
+; CHECK-NEXT:       .value_type:     i64
+; CHECK-NEXT:     - .value_kind:     hidden_global_offset_z
+; CHECK-NEXT:       .offset:         24
+; CHECK-NEXT:       .size:           8
+; CHECK-NEXT:       .value_type:     i64
+; CHECK-NEXT:     - .value_kind:     hidden_printf_buffer
+; CHECK-NEXT:       .offset:         32
+; CHECK-NEXT:       .size:           8
+; CHECK-NEXT:       .value_type:     i8
+; CHECK-NEXT:       .address_space:  global
+define amdgpu_kernel void @test_arg_ptr_to_ptr(i32 addrspace(5)* addrspace(1)* %a)
+    !kernel_arg_addr_space !81 !kernel_arg_access_qual !2 !kernel_arg_type !80
+    !kernel_arg_base_type !80 !kernel_arg_type_qual !4 {
+  ret void
+}
+
+; CHECK:        .symbol:          test_arg_struct_contains_ptr.kd
+; CHECK:        .name:            test_arg_struct_contains_ptr
+; CHECK:        .language:        OpenCL C
+; CHECK:        .language_version:
+; CHECK-NEXT:     - 2
+; CHECK-NEXT:     - 0
+; CHECK:        .args:
+; CHECK-NEXT:     - .type_name:      struct B
+; CHECK-NEXT:       .value_kind:     global_buffer
+; CHECK-NEXT:       .name:           a
+; CHECK-NEXT:       .offset:         0
+; CHECK-NEXT:       .size:           4
+; CHECK-NEXT:       .value_type:     struct
+; CHECK-NEXT:       .address_space: private
+; CHECK-NEXT:     - .value_kind:     hidden_global_offset_x
+; CHECK-NEXT:       .offset:         8
+; CHECK-NEXT:       .size:           8
+; CHECK-NEXT:       .value_type:     i64
+; CHECK-NEXT:     - .value_kind:     hidden_global_offset_y
+; CHECK-NEXT:       .offset:         16
+; CHECK-NEXT:       .size:           8
+; CHECK-NEXT:       .value_type:     i64
+; CHECK-NEXT:     - .value_kind:     hidden_global_offset_z
+; CHECK-NEXT:       .offset:         24
+; CHECK-NEXT:       .size:           8
+; CHECK-NEXT:       .value_type:     i64
+; CHECK-NEXT:     - .value_kind:     hidden_printf_buffer
+; CHECK-NEXT:       .offset:         32
+; CHECK-NEXT:       .size:           8
+; CHECK-NEXT:       .value_type:     i8
+; CHECK-NEXT:       .address_space:  global
+define amdgpu_kernel void @test_arg_struct_contains_ptr(%struct.B addrspace(5)* byval %a)
+    !kernel_arg_addr_space !1 !kernel_arg_access_qual !2 !kernel_arg_type !82
+    !kernel_arg_base_type !82 !kernel_arg_type_qual !4 {
+ ret void
+}
+
+; CHECK:        .symbol:          test_arg_vector_of_ptr.kd
+; CHECK:        .name:            test_arg_vector_of_ptr
+; CHECK:        .language:        OpenCL C
+; CHECK:        .language_version:
+; CHECK-NEXT:     - 2
+; CHECK-NEXT:     - 0
+; CHECK:        .args:
+; CHECK-NEXT:     - .type_name:      'global int addrspace(5)* __attribute__((ext_vector_type(2)))'
+; CHECK-NEXT:       .value_kind:     by_value
+; CHECK-NEXT:       .offset:         0
+; CHECK-NEXT:       .size:           16
+; CHECK-NEXT:       .value_type:     i32
+; CHECK-NEXT:       .name:           a
+; CHECK-NEXT:     - .value_kind:     hidden_global_offset_x
+; CHECK-NEXT:       .offset:         16
+; CHECK-NEXT:       .size:           8
+; CHECK-NEXT:       .value_type:     i64
+; CHECK-NEXT:     - .value_kind:     hidden_global_offset_y
+; CHECK-NEXT:       .offset:         24
+; CHECK-NEXT:       .size:           8
+; CHECK-NEXT:       .value_type:     i64
+; CHECK-NEXT:     - .value_kind:     hidden_global_offset_z
+; CHECK-NEXT:       .offset:         32
+; CHECK-NEXT:       .size:           8
+; CHECK-NEXT:       .value_type:     i64
+; CHECK-NEXT:     - .value_kind:     hidden_printf_buffer
+; CHECK-NEXT:       .offset:         40
+; CHECK-NEXT:       .size:           8
+; CHECK-NEXT:       .value_type:     i8
+; CHECK-NEXT:       .address_space:  global
+define amdgpu_kernel void @test_arg_vector_of_ptr(<2 x i32 addrspace(1)*> %a)
+    !kernel_arg_addr_space !1 !kernel_arg_access_qual !2 !kernel_arg_type !83
+    !kernel_arg_base_type !83 !kernel_arg_type_qual !4 {
+  ret void
+}
+
+; CHECK:        .symbol:          test_arg_unknown_builtin_type.kd
+; CHECK:        .name:            test_arg_unknown_builtin_type
+; CHECK:        .language:        OpenCL C
+; CHECK:        .language_version:
+; CHECK-NEXT:     - 2
+; CHECK-NEXT:     - 0
+; CHECK:        .args:
+; CHECK-NEXT:     - .type_name:      clk_event_t
+; CHECK-NEXT:       .value_kind:     global_buffer
+; CHECK-NEXT:       .name:           a
+; CHECK-NEXT:       .offset:         0
+; CHECK-NEXT:       .size:           8
+; CHECK-NEXT:       .value_type:     struct
+; CHECK-NEXT:       .address_space:  global
+; CHECK-NEXT:     - .value_kind:     hidden_global_offset_x
+; CHECK-NEXT:       .offset:         8
+; CHECK-NEXT:       .size:           8
+; CHECK-NEXT:       .value_type:     i64
+; CHECK-NEXT:     - .value_kind:     hidden_global_offset_y
+; CHECK-NEXT:       .offset:         16
+; CHECK-NEXT:       .size:           8
+; CHECK-NEXT:       .value_type:     i64
+; CHECK-NEXT:     - .value_kind:     hidden_global_offset_z
+; CHECK-NEXT:       .offset:         24
+; CHECK-NEXT:       .size:           8
+; CHECK-NEXT:       .value_type:     i64
+; CHECK-NEXT:     - .value_kind:     hidden_printf_buffer
+; CHECK-NEXT:       .offset:         32
+; CHECK-NEXT:       .size:           8
+; CHECK-NEXT:       .value_type:     i8
+; CHECK-NEXT:       .address_space:  global
+define amdgpu_kernel void @test_arg_unknown_builtin_type(
+    %opencl.clk_event_t addrspace(1)* %a)
+    !kernel_arg_addr_space !81 !kernel_arg_access_qual !2 !kernel_arg_type !84
+    !kernel_arg_base_type !84 !kernel_arg_type_qual !4 {
+  ret void
+}
+
+; CHECK:        .symbol:          test_pointee_align.kd
+; CHECK:        .name:            test_pointee_align
+; CHECK:        .language:        OpenCL C
+; CHECK:        .language_version:
+; CHECK-NEXT:     - 2
+; CHECK-NEXT:     - 0
+; CHECK:        .args:
+; CHECK-NEXT:     - .type_name:      'long  addrspace(5)*'
+; CHECK-NEXT:       .value_kind:     global_buffer
+; CHECK-NEXT:       .name:           a
+; CHECK-NEXT:       .offset:         0
+; CHECK-NEXT:       .size:           8
+; CHECK-NEXT:       .value_type:     i64
+; CHECK-NEXT:       .address_space:  global
+; CHECK-NEXT:     - .type_name:      'char  addrspace(5)*'
+; CHECK-NEXT:       .value_kind:     dynamic_shared_pointer
+; CHECK-NEXT:       .name:           b
+; CHECK-NEXT:       .offset:         8
+; CHECK-NEXT:       .size:           4
+; CHECK-NEXT:       .value_type:     i8
+; CHECK-NEXT:       .pointee_align:  1
+; CHECK-NEXT:       .address_space: local
+; CHECK-NEXT:     - .type_name:      'char2  addrspace(5)*'
+; CHECK-NEXT:       .value_kind:     dynamic_shared_pointer
+; CHECK-NEXT:       .name:           c
+; CHECK-NEXT:       .offset:         12
+; CHECK-NEXT:       .size:           4
+; CHECK-NEXT:       .value_type:     i8
+; CHECK-NEXT:       .pointee_align:  2
+; CHECK-NEXT:       .address_space: local
+; CHECK-NEXT:     - .type_name:      'char3  addrspace(5)*'
+; CHECK-NEXT:       .value_kind:     dynamic_shared_pointer
+; CHECK-NEXT:       .name:           d
+; CHECK-NEXT:       .offset:         16
+; CHECK-NEXT:       .size:           4
+; CHECK-NEXT:       .value_type:     i8
+; CHECK-NEXT:       .pointee_align:  4
+; CHECK-NEXT:       .address_space: local
+; CHECK-NEXT:     - .type_name:      'char4  addrspace(5)*'
+; CHECK-NEXT:       .value_kind:     dynamic_shared_pointer
+; CHECK-NEXT:       .name:           e
+; CHECK-NEXT:       .offset:         20
+; CHECK-NEXT:       .size:           4
+; CHECK-NEXT:       .value_type:     i8
+; CHECK-NEXT:       .pointee_align:  4
+; CHECK-NEXT:       .address_space: local
+; CHECK-NEXT:     - .type_name:      'char8  addrspace(5)*'
+; CHECK-NEXT:       .value_kind:     dynamic_shared_pointer
+; CHECK-NEXT:       .name:           f
+; CHECK-NEXT:       .offset:         24
+; CHECK-NEXT:       .size:           4
+; CHECK-NEXT:       .value_type:     i8
+; CHECK-NEXT:       .pointee_align:  8
+; CHECK-NEXT:       .address_space: local
+; CHECK-NEXT:     - .type_name:      'char16  addrspace(5)*'
+; CHECK-NEXT:       .value_kind:     dynamic_shared_pointer
+; CHECK-NEXT:       .name:           g
+; CHECK-NEXT:       .offset:         28
+; CHECK-NEXT:       .size:           4
+; CHECK-NEXT:       .value_type:     i8
+; CHECK-NEXT:       .pointee_align:  16
+; CHECK-NEXT:       .address_space: local
+; CHECK-NEXT:     - .value_kind:     hidden_global_offset_x
+; CHECK-NEXT:       .offset:         32
+; CHECK-NEXT:       .size:           8
+; CHECK-NEXT:       .value_type:     i64
+; CHECK-NEXT:     - .value_kind:     hidden_global_offset_y
+; CHECK-NEXT:       .offset:         40
+; CHECK-NEXT:       .size:           8
+; CHECK-NEXT:       .value_type:     i64
+; CHECK-NEXT:     - .value_kind:     hidden_global_offset_z
+; CHECK-NEXT:       .offset:         48
+; CHECK-NEXT:       .size:           8
+; CHECK-NEXT:       .value_type:     i64
+; CHECK-NEXT:     - .value_kind:     hidden_printf_buffer
+; CHECK-NEXT:       .offset:         56
+; CHECK-NEXT:       .size:           8
+; CHECK-NEXT:       .value_type:     i8
+; CHECK-NEXT:       .address_space:  global
+define amdgpu_kernel void @test_pointee_align(i64 addrspace(1)* %a,
+                                              i8 addrspace(3)* %b,
+                                              <2 x i8> addrspace(3)* %c,
+                                              <3 x i8> addrspace(3)* %d,
+                                              <4 x i8> addrspace(3)* %e,
+                                              <8 x i8> addrspace(3)* %f,
+                                              <16 x i8> addrspace(3)* %g)
+    !kernel_arg_addr_space !91 !kernel_arg_access_qual !92 !kernel_arg_type !93
+    !kernel_arg_base_type !93 !kernel_arg_type_qual !94 {
+  ret void
+}
+
+; CHECK:        .symbol:          __test_block_invoke_kernel.kd
+; CHECK:        .device_enqueue_symbol: __test_block_invoke_kernel_runtime_handle
+; CHECK:        .name:            __test_block_invoke_kernel
+; CHECK:        .language:        OpenCL C
+; CHECK:        .language_version:
+; CHECK-NEXT:     - 2
+; CHECK-NEXT:     - 0
+; CHECK:        .args:
+; CHECK-NEXT:     - .type_name:      __block_literal
+; CHECK-NEXT:       .value_kind:     by_value
+; CHECK-NEXT:       .offset:         0
+; CHECK-NEXT:       .size:           25
+; CHECK-NEXT:       .value_type:     struct
+; CHECK-NEXT:       .name:           arg
+; CHECK-NEXT:     - .value_kind:     hidden_global_offset_x
+; CHECK-NEXT:       .offset:         32
+; CHECK-NEXT:       .size:           8
+; CHECK-NEXT:       .value_type:     i64
+; CHECK-NEXT:     - .value_kind:     hidden_global_offset_y
+; CHECK-NEXT:       .offset:         40
+; CHECK-NEXT:       .size:           8
+; CHECK-NEXT:       .value_type:     i64
+; CHECK-NEXT:     - .value_kind:     hidden_global_offset_z
+; CHECK-NEXT:       .offset:         48
+; CHECK-NEXT:       .size:           8
+; CHECK-NEXT:       .value_type:     i64
+; CHECK-NEXT:     - .value_kind:     hidden_printf_buffer
+; CHECK-NEXT:       .offset:         56
+; CHECK-NEXT:       .size:           8
+; CHECK-NEXT:       .value_type:     i8
+; CHECK-NEXT:       .address_space:  global
+define amdgpu_kernel void @__test_block_invoke_kernel(
+    <{ i32, i32, i8*, i8 addrspace(1)*, i8 }> %arg) #0
+    !kernel_arg_addr_space !1 !kernel_arg_access_qual !2 !kernel_arg_type !110
+    !kernel_arg_base_type !110 !kernel_arg_type_qual !4 {
+  ret void
+}
+
+; CHECK:        .symbol:          test_enqueue_kernel_caller.kd
+; CHECK:        .name:            test_enqueue_kernel_caller
+; CHECK:        .language:        OpenCL C
+; CHECK:        .language_version:
+; CHECK-NEXT:     - 2
+; CHECK-NEXT:     - 0
+; CHECK:        .args:
+; CHECK-NEXT:     - .type_name:      char
+; CHECK-NEXT:       .value_kind:     by_value
+; CHECK-NEXT:       .offset:         0
+; CHECK-NEXT:       .size:           1
+; CHECK-NEXT:       .value_type:     i8
+; CHECK-NEXT:       .name:           a
+; CHECK-NEXT:     - .value_kind:     hidden_global_offset_x
+; CHECK-NEXT:       .offset:         8
+; CHECK-NEXT:       .size:           8
+; CHECK-NEXT:       .value_type:     i64
+; CHECK-NEXT:     - .value_kind:     hidden_global_offset_y
+; CHECK-NEXT:       .offset:         16
+; CHECK-NEXT:       .size:           8
+; CHECK-NEXT:       .value_type:     i64
+; CHECK-NEXT:     - .value_kind:     hidden_global_offset_z
+; CHECK-NEXT:       .offset:         24
+; CHECK-NEXT:       .size:           8
+; CHECK-NEXT:       .value_type:     i64
+; CHECK-NEXT:     - .value_kind:     hidden_printf_buffer
+; CHECK-NEXT:       .offset:         32
+; CHECK-NEXT:       .size:           8
+; CHECK-NEXT:       .value_type:     i8
+; CHECK-NEXT:       .address_space:  global
+; CHECK-NEXT:     - .value_kind:     hidden_default_queue
+; CHECK-NEXT:       .offset:         40
+; CHECK-NEXT:       .size:           8
+; CHECK-NEXT:       .value_type:     i8
+; CHECK-NEXT:       .address_space:  global
+; CHECK-NEXT:     - .value_kind:     hidden_completion_action
+; CHECK-NEXT:       .offset:         48
+; CHECK-NEXT:       .size:           8
+; CHECK-NEXT:       .value_type:     i8
+; CHECK-NEXT:       .address_space:  global
+define amdgpu_kernel void @test_enqueue_kernel_caller(i8 %a) #1
+    !kernel_arg_addr_space !1 !kernel_arg_access_qual !2 !kernel_arg_type !9
+    !kernel_arg_base_type !9 !kernel_arg_type_qual !4 {
+  ret void
+}
+
+; CHECK:  amdhsa.version:
+; CHECK-NEXT: - 1
+; CHECK-NEXT: - 0
+; CHECK:  amdhsa.printf:
+; CHECK-NEXT: - '1:1:4:%d\n'
+; CHECK-NEXT: - '2:1:8:%g\n'
+
+attributes #0 = { "runtime-handle"="__test_block_invoke_kernel_runtime_handle" }
+attributes #1 = { "calls-enqueue-kernel" }
+
+!llvm.printf.fmts = !{!100, !101}
+
+!1 = !{i32 0}
+!2 = !{!"none"}
+!3 = !{!"int"}
+!4 = !{!""}
+!5 = !{i32 undef, i32 1}
+!6 = !{i32 1, i32 2, i32 4}
+!7 = !{<4 x i32> undef, i32 0}
+!8 = !{i32 8, i32 16, i32 32}
+!9 = !{!"char"}
+!10 = !{!"ushort2"}
+!11 = !{!"int3"}
+!12 = !{!"ulong4"}
+!13 = !{!"half8"}
+!14 = !{!"float16"}
+!15 = !{!"double16"}
+!16 = !{!"int  addrspace(5)*"}
+!17 = !{!"image2d_t"}
+!18 = !{!"sampler_t"}
+!19 = !{!"queue_t"}
+!20 = !{!"struct A"}
+!21 = !{!"i128"}
+!22 = !{i32 0, i32 0, i32 0}
+!23 = !{!"none", !"none", !"none"}
+!24 = !{!"int", !"short2", !"char3"}
+!25 = !{!"", !"", !""}
+!26 = !{half undef, i32 1}
+!27 = !{float undef, i32 1}
+!28 = !{double undef, i32 1}
+!29 = !{i8 undef, i32 1}
+!30 = !{i16 undef, i32 1}
+!31 = !{i64 undef, i32 1}
+!32 = !{i32  addrspace(5)*undef, i32 1}
+!50 = !{i32 1, i32 2, i32 3}
+!51 = !{!"int  addrspace(5)*", !"int  addrspace(5)*", !"int  addrspace(5)*"}
+!60 = !{i32 1, i32 1, i32 1}
+!61 = !{!"read_only", !"write_only", !"read_write"}
+!62 = !{!"image1d_t", !"image2d_t", !"image3d_t"}
+!70 = !{!"volatile", !"const restrict", !"pipe"}
+!80 = !{!"int  addrspace(5)* addrspace(5)*"}
+!81 = !{i32 1}
+!82 = !{!"struct B"}
+!83 = !{!"global int addrspace(5)* __attribute__((ext_vector_type(2)))"}
+!84 = !{!"clk_event_t"}
+!opencl.ocl.version = !{!90}
+!90 = !{i32 2, i32 0}
+!91 = !{i32 0, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3}
+!92 = !{!"none", !"none", !"none", !"none", !"none", !"none", !"none"}
+!93 = !{!"long  addrspace(5)*", !"char  addrspace(5)*", !"char2  addrspace(5)*", !"char3  addrspace(5)*", !"char4  addrspace(5)*", !"char8  addrspace(5)*", !"char16  addrspace(5)*"}
+!94 = !{!"", !"", !"", !"", !"", !"", !""}
+!100 = !{!"1:1:4:%d\5Cn"}
+!101 = !{!"2:1:8:%g\5Cn"}
+!110 = !{!"__block_literal"}
+
+; PARSER: AMDGPU HSA Metadata Parser Test: PASS
Index: test/CodeGen/AMDGPU/hsa-metadata-hidden-args-v3.ll
===================================================================
--- /dev/null
+++ test/CodeGen/AMDGPU/hsa-metadata-hidden-args-v3.ll
@@ -0,0 +1,72 @@
+; RUN: llc -mattr=+code-object-v3 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx700 -filetype=obj -o - < %s | llvm-readobj -elf-output-style=GNU -notes | FileCheck --check-prefix=CHECK --check-prefix=GFX700 --check-prefix=NOTES %s
+; RUN: llc -mattr=+code-object-v3 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx803 -filetype=obj -o - < %s | llvm-readobj -elf-output-style=GNU -notes | FileCheck --check-prefix=CHECK --check-prefix=GFX803 --check-prefix=NOTES %s
+; RUN: llc -mattr=+code-object-v3 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -filetype=obj -o - < %s | llvm-readobj -elf-output-style=GNU -notes | FileCheck --check-prefix=CHECK --check-prefix=GFX900 --check-prefix=NOTES %s
+
+; CHECK: ---
+; CHECK:  amdhsa.kernels:
+; CHECK:        .symbol:     test.kd
+; CHECK:        .name:       test
+; CHECK:        .args:
+; CHECK-NEXT:     - .value_kind:      global_buffer
+; CHECK-NEXT:       .name:            r
+; CHECK-NEXT:       .offset:          0
+; CHECK-NEXT:       .size:            8
+; CHECK-NEXT:       .value_type:      f16
+; CHECK-NEXT:       .address_space:   global
+; CHECK-NEXT:     - .value_kind:      global_buffer
+; CHECK-NEXT:       .name:            a
+; CHECK-NEXT:       .offset:          8
+; CHECK-NEXT:       .size:            8
+; CHECK-NEXT:       .value_type:      f16
+; CHECK-NEXT:       .address_space:   global
+; CHECK-NEXT:     - .value_kind:      global_buffer
+; CHECK-NEXT:       .name:            b
+; CHECK-NEXT:       .offset:          16
+; CHECK-NEXT:       .size:            8
+; CHECK-NEXT:       .value_type:      f16
+; CHECK-NEXT:       .address_space:   global
+; CHECK-NEXT:     - .value_kind:      hidden_global_offset_x
+; CHECK-NEXT:       .offset:          24
+; CHECK-NEXT:       .size:            8
+; CHECK-NEXT:       .value_type:      i64
+; CHECK-NEXT:     - .value_kind:      hidden_global_offset_y
+; CHECK-NEXT:       .offset:          32
+; CHECK-NEXT:       .size:            8
+; CHECK-NEXT:       .value_type:      i64
+; CHECK-NEXT:     - .value_kind:      hidden_global_offset_z
+; CHECK-NEXT:       .offset:          40
+; CHECK-NEXT:       .size:            8
+; CHECK-NEXT:       .value_type:      i64
+; CHECK-NEXT:     - .value_kind:      hidden_none
+; CHECK-NEXT:       .offset:          48
+; CHECK-NEXT:       .size:            8
+; CHECK-NEXT:       .value_type:      i8
+; CHECK-NEXT:       .address_space:   global
+; CHECK-NEXT:     - .value_kind:      hidden_none
+; CHECK-NEXT:       .offset:          56
+; CHECK-NEXT:       .size:            8
+; CHECK-NEXT:       .value_type:      i8
+; CHECK-NEXT:       .address_space:   global
+; CHECK-NEXT:     - .value_kind:      hidden_none
+; CHECK-NEXT:       .offset:          64
+; CHECK-NEXT:       .size:            8
+; CHECK-NEXT:       .value_type:      i8
+; CHECK-NEXT:       .address_space:   global
+define amdgpu_kernel void @test(
+    half addrspace(1)* %r,
+    half addrspace(1)* %a,
+    half addrspace(1)* %b) {
+entry:
+  %a.val = load half, half addrspace(1)* %a
+  %b.val = load half, half addrspace(1)* %b
+  %r.val = fadd half %a.val, %b.val
+  store half %r.val, half addrspace(1)* %r
+  ret void
+}
+
+; CHECK:  amdhsa.version:
+; CHECK-NEXT: - 1
+; CHECK-NEXT: - 0
+
+!opencl.ocl.version = !{!0}
+!0 = !{i32 2, i32 0}
Index: test/CodeGen/AMDGPU/hsa-metadata-images-v3.ll
===================================================================
--- /dev/null
+++ test/CodeGen/AMDGPU/hsa-metadata-images-v3.ll
@@ -0,0 +1,95 @@
+; RUN: llc -mattr=+code-object-v3 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx700 -filetype=obj -o - < %s | llvm-readobj -elf-output-style=GNU -notes | FileCheck --check-prefix=CHECK --check-prefix=GFX700 --check-prefix=NOTES %s
+; RUN: llc -mattr=+code-object-v3 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx802 -filetype=obj -o - < %s | llvm-readobj -elf-output-style=GNU -notes | FileCheck --check-prefix=CHECK --check-prefix=GFX802 --check-prefix=NOTES %s
+; RUN: llc -mattr=+code-object-v3 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -filetype=obj -o - < %s | llvm-readobj -elf-output-style=GNU -notes | FileCheck --check-prefix=CHECK --check-prefix=GFX900 --check-prefix=NOTES %s
+
+%opencl.image1d_t = type opaque
+%opencl.image1d_array_t = type opaque
+%opencl.image1d_buffer_t = type opaque
+%opencl.image2d_t = type opaque
+%opencl.image2d_array_t = type opaque
+%opencl.image2d_array_depth_t = type opaque
+%opencl.image2d_array_msaa_t = type opaque
+%opencl.image2d_array_msaa_depth_t = type opaque
+%opencl.image2d_depth_t = type opaque
+%opencl.image2d_msaa_t = type opaque
+%opencl.image2d_msaa_depth_t = type opaque
+%opencl.image3d_t = type opaque
+
+; CHECK: ---
+; CHECK:  amdhsa.kernels:
+; CHECK:      .symbol:     test.kd
+; CHECK:      .name:       test
+; CHECK:      .args:
+; CHECK:        - .type_name:  image1d_t
+; CHECK:          .value_kind: image
+; CHECK:          .name:       a
+; CHECK:          .size:       8
+; CHECK:        - .type_name:  image1d_array_t
+; CHECK:          .value_kind: image
+; CHECK:          .name:       b
+; CHECK:          .size:       8
+; CHECK:        - .type_name:  image1d_buffer_t
+; CHECK:          .value_kind: image
+; CHECK:          .name:       c
+; CHECK:          .size:       8
+; CHECK:        - .type_name:  image2d_t
+; CHECK:          .value_kind: image
+; CHECK:          .name:       d
+; CHECK:          .size:       8
+; CHECK:        - .type_name:  image2d_array_t
+; CHECK:          .value_kind: image
+; CHECK:          .name:       e
+; CHECK:          .size:       8
+; CHECK:        - .type_name:  image2d_array_depth_t
+; CHECK:          .value_kind: image
+; CHECK:          .name:       f
+; CHECK:          .size:       8
+; CHECK:        - .type_name:  image2d_array_msaa_t
+; CHECK:          .value_kind: image
+; CHECK:          .name:       g
+; CHECK:          .size:       8
+; CHECK:        - .type_name:  image2d_array_msaa_depth_t
+; CHECK:          .value_kind: image
+; CHECK:          .name:       h
+; CHECK:          .size:       8
+; CHECK:        - .type_name:  image2d_depth_t
+; CHECK:          .value_kind: image
+; CHECK:          .name:       i
+; CHECK:          .size:       8
+; CHECK:        - .type_name:  image2d_msaa_t
+; CHECK:          .value_kind: image
+; CHECK:          .name:       j
+; CHECK:          .size:       8
+; CHECK:        - .type_name:  image2d_msaa_depth_t
+; CHECK:          .value_kind: image
+; CHECK:          .name:       k
+; CHECK:          .size:       8
+; CHECK:        - .type_name:  image3d_t
+; CHECK:          .value_kind: image
+; CHECK:          .name:       l
+; CHECK:          .size:       8
+define amdgpu_kernel void @test(%opencl.image1d_t addrspace(1)* %a,
+                                %opencl.image1d_array_t addrspace(1)* %b,
+                                %opencl.image1d_buffer_t addrspace(1)* %c,
+                                %opencl.image2d_t addrspace(1)* %d,
+                                %opencl.image2d_array_t addrspace(1)* %e,
+                                %opencl.image2d_array_depth_t addrspace(1)* %f,
+                                %opencl.image2d_array_msaa_t addrspace(1)* %g,
+                                %opencl.image2d_array_msaa_depth_t addrspace(1)* %h,
+                                %opencl.image2d_depth_t addrspace(1)* %i,
+                                %opencl.image2d_msaa_t addrspace(1)* %j,
+                                %opencl.image2d_msaa_depth_t addrspace(1)* %k,
+                                %opencl.image3d_t addrspace(1)* %l)
+    !kernel_arg_type !1 !kernel_arg_base_type !1 {
+  ret void
+}
+
+; CHECK:  amdhsa.version:
+; CHECK-NEXT: - 1
+; CHECK-NEXT: - 0
+
+!1 = !{!"image1d_t", !"image1d_array_t", !"image1d_buffer_t",
+       !"image2d_t", !"image2d_array_t", !"image2d_array_depth_t",
+       !"image2d_array_msaa_t", !"image2d_array_msaa_depth_t",
+       !"image2d_depth_t", !"image2d_msaa_t", !"image2d_msaa_depth_t",
+       !"image3d_t"}
Index: test/CodeGen/AMDGPU/hsa-metadata-invalid-ocl-version-1-v3.ll
===================================================================
--- /dev/null
+++ test/CodeGen/AMDGPU/hsa-metadata-invalid-ocl-version-1-v3.ll
@@ -0,0 +1,11 @@
+; RUN: llc -mattr=+code-object-v3 -mtriple=amdgcn-amd-amdhsa -filetype=obj -o - < %s | llvm-readobj -elf-output-style=GNU -notes | FileCheck %s
+
+; Make sure llc does not crash for invalid opencl version metadata.
+
+; CHECK: ---
+; CHECK: amdhsa.version:
+; CHECK-NEXT: - 1
+; CHECK-NEXT: - 0
+; CHECK: ...
+
+!opencl.ocl.version = !{}
Index: test/CodeGen/AMDGPU/hsa-metadata-invalid-ocl-version-2-v3.ll
===================================================================
--- /dev/null
+++ test/CodeGen/AMDGPU/hsa-metadata-invalid-ocl-version-2-v3.ll
@@ -0,0 +1,12 @@
+; RUN: llc -mattr=+code-object-v3 -mtriple=amdgcn-amd-amdhsa -filetype=obj -o - < %s | llvm-readobj -elf-output-style=GNU -notes | FileCheck %s
+
+; Make sure llc does not crash for invalid opencl version metadata.
+
+; CHECK: ---
+; CHECK: amdhsa.version:
+; CHECK-NEXT: - 1
+; CHECK-NEXT: - 0
+; CHECK: ...
+
+!opencl.ocl.version = !{!0}
+!0 = !{}
Index: test/CodeGen/AMDGPU/hsa-metadata-invalid-ocl-version-3-v3.ll
===================================================================
--- /dev/null
+++ test/CodeGen/AMDGPU/hsa-metadata-invalid-ocl-version-3-v3.ll
@@ -0,0 +1,12 @@
+; RUN: llc -mattr=+code-object-v3 -mtriple=amdgcn-amd-amdhsa -filetype=obj -o - < %s | llvm-readobj -elf-output-style=GNU -notes | FileCheck %s
+
+; Make sure llc does not crash for invalid opencl version metadata.
+
+; CHECK: ---
+; CHECK: amdhsa.version:
+; CHECK-NEXT: - 1
+; CHECK-NEXT: - 0
+; CHECK: ...
+
+!opencl.ocl.version = !{!0}
+!0 = !{i32 1}
Index: test/CodeGen/AMDGPU/hsa-metadata-kernel-code-props-v3.ll
===================================================================
--- /dev/null
+++ test/CodeGen/AMDGPU/hsa-metadata-kernel-code-props-v3.ll
@@ -0,0 +1,140 @@
+; RUN: llc -mattr=+code-object-v3 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx700 -enable-misched=0 -filetype=obj -o - < %s | llvm-readobj -elf-output-style=GNU -notes | FileCheck --check-prefix=CHECK --check-prefix=NOTES %s
+; RUN: llc -mattr=+code-object-v3 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx803 -enable-misched=0 -filetype=obj -o - < %s | llvm-readobj -elf-output-style=GNU -notes | FileCheck --check-prefix=CHECK --check-prefix=NOTES %s
+; RUN: llc -mattr=+code-object-v3 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -enable-misched=0 -filetype=obj -o - < %s | llvm-readobj -elf-output-style=GNU -notes | FileCheck --check-prefix=CHECK --check-prefix=NOTES %s
+
+@var = addrspace(1) global float 0.0
+
+; CHECK: ---
+; CHECK:  amdhsa.kernels:
+
+; CHECK: - .max_flat_workgroup_size:    256
+; CHECK:   .kernarg_segment_size:       24
+; CHECK:   .private_segment_fixed_size: 0
+; CHECK:   .wavefront_size:             64
+; CHECK:   .symbol:     test.kd
+; CHECK:   .name:       test
+; CHECK:   .sgpr_count:                 6
+; CHECK:   .kernarg_segment_align:      8
+; CHECK:   .vgpr_count:                 3
+; CHECK:   .group_segment_fixed_size:   0
+define amdgpu_kernel void @test(
+    half addrspace(1)* %r,
+    half addrspace(1)* %a,
+    half addrspace(1)* %b) {
+entry:
+  %a.val = load half, half addrspace(1)* %a
+  %b.val = load half, half addrspace(1)* %b
+  %r.val = fadd half %a.val, %b.val
+  store half %r.val, half addrspace(1)* %r
+  ret void
+}
+
+; CHECK:   .symbol:     num_spilled_sgprs.kd
+; CHECK:   .name:       num_spilled_sgprs
+; CHECK:   .sgpr_spill_count: 41
+define amdgpu_kernel void @num_spilled_sgprs(
+    i32 addrspace(1)* %out0, i32 addrspace(1)* %out1, i32 addrspace(1)* %out2,
+    i32 addrspace(1)* %out3, i32 addrspace(1)* %out4, i32 addrspace(1)* %out5,
+    i32 addrspace(1)* %out6, i32 addrspace(1)* %out7, i32 addrspace(1)* %out8,
+    i32 addrspace(1)* %out9, i32 addrspace(1)* %outa, i32 addrspace(1)* %outb,
+    i32 addrspace(1)* %outc, i32 addrspace(1)* %outd, i32 addrspace(1)* %oute,
+    i32 addrspace(1)* %outf, i32 %in0, i32 %in1, i32 %in2, i32 %in3, i32 %in4,
+    i32 %in5, i32 %in6, i32 %in7, i32 %in8, i32 %in9, i32 %ina, i32 %inb,
+    i32 %inc, i32 %ind, i32 %ine, i32 %inf) #0 {
+entry:
+  store i32 %in0, i32 addrspace(1)* %out0
+  store i32 %in1, i32 addrspace(1)* %out1
+  store i32 %in2, i32 addrspace(1)* %out2
+  store i32 %in3, i32 addrspace(1)* %out3
+  store i32 %in4, i32 addrspace(1)* %out4
+  store i32 %in5, i32 addrspace(1)* %out5
+  store i32 %in6, i32 addrspace(1)* %out6
+  store i32 %in7, i32 addrspace(1)* %out7
+  store i32 %in8, i32 addrspace(1)* %out8
+  store i32 %in9, i32 addrspace(1)* %out9
+  store i32 %ina, i32 addrspace(1)* %outa
+  store i32 %inb, i32 addrspace(1)* %outb
+  store i32 %inc, i32 addrspace(1)* %outc
+  store i32 %ind, i32 addrspace(1)* %outd
+  store i32 %ine, i32 addrspace(1)* %oute
+  store i32 %inf, i32 addrspace(1)* %outf
+  ret void
+}
+
+; CHECK:   .symbol:     num_spilled_vgprs.kd
+; CHECK:   .name:       num_spilled_vgprs
+; CHECK:   .vgpr_spill_count: 14
+define amdgpu_kernel void @num_spilled_vgprs() #1 {
+  %val0 = load volatile float, float addrspace(1)* @var
+  %val1 = load volatile float, float addrspace(1)* @var
+  %val2 = load volatile float, float addrspace(1)* @var
+  %val3 = load volatile float, float addrspace(1)* @var
+  %val4 = load volatile float, float addrspace(1)* @var
+  %val5 = load volatile float, float addrspace(1)* @var
+  %val6 = load volatile float, float addrspace(1)* @var
+  %val7 = load volatile float, float addrspace(1)* @var
+  %val8 = load volatile float, float addrspace(1)* @var
+  %val9 = load volatile float, float addrspace(1)* @var
+  %val10 = load volatile float, float addrspace(1)* @var
+  %val11 = load volatile float, float addrspace(1)* @var
+  %val12 = load volatile float, float addrspace(1)* @var
+  %val13 = load volatile float, float addrspace(1)* @var
+  %val14 = load volatile float, float addrspace(1)* @var
+  %val15 = load volatile float, float addrspace(1)* @var
+  %val16 = load volatile float, float addrspace(1)* @var
+  %val17 = load volatile float, float addrspace(1)* @var
+  %val18 = load volatile float, float addrspace(1)* @var
+  %val19 = load volatile float, float addrspace(1)* @var
+  %val20 = load volatile float, float addrspace(1)* @var
+  %val21 = load volatile float, float addrspace(1)* @var
+  %val22 = load volatile float, float addrspace(1)* @var
+  %val23 = load volatile float, float addrspace(1)* @var
+  %val24 = load volatile float, float addrspace(1)* @var
+  %val25 = load volatile float, float addrspace(1)* @var
+  %val26 = load volatile float, float addrspace(1)* @var
+  %val27 = load volatile float, float addrspace(1)* @var
+  %val28 = load volatile float, float addrspace(1)* @var
+  %val29 = load volatile float, float addrspace(1)* @var
+  %val30 = load volatile float, float addrspace(1)* @var
+
+  store volatile float %val0, float addrspace(1)* @var
+  store volatile float %val1, float addrspace(1)* @var
+  store volatile float %val2, float addrspace(1)* @var
+  store volatile float %val3, float addrspace(1)* @var
+  store volatile float %val4, float addrspace(1)* @var
+  store volatile float %val5, float addrspace(1)* @var
+  store volatile float %val6, float addrspace(1)* @var
+  store volatile float %val7, float addrspace(1)* @var
+  store volatile float %val8, float addrspace(1)* @var
+  store volatile float %val9, float addrspace(1)* @var
+  store volatile float %val10, float addrspace(1)* @var
+  store volatile float %val11, float addrspace(1)* @var
+  store volatile float %val12, float addrspace(1)* @var
+  store volatile float %val13, float addrspace(1)* @var
+  store volatile float %val14, float addrspace(1)* @var
+  store volatile float %val15, float addrspace(1)* @var
+  store volatile float %val16, float addrspace(1)* @var
+  store volatile float %val17, float addrspace(1)* @var
+  store volatile float %val18, float addrspace(1)* @var
+  store volatile float %val19, float addrspace(1)* @var
+  store volatile float %val20, float addrspace(1)* @var
+  store volatile float %val21, float addrspace(1)* @var
+  store volatile float %val22, float addrspace(1)* @var
+  store volatile float %val23, float addrspace(1)* @var
+  store volatile float %val24, float addrspace(1)* @var
+  store volatile float %val25, float addrspace(1)* @var
+  store volatile float %val26, float addrspace(1)* @var
+  store volatile float %val27, float addrspace(1)* @var
+  store volatile float %val28, float addrspace(1)* @var
+  store volatile float %val29, float addrspace(1)* @var
+  store volatile float %val30, float addrspace(1)* @var
+
+  ret void
+}
+
+; CHECK:  amdhsa.version:
+; CHECK-NEXT: - 1
+; CHECK-NEXT: - 0
+
+attributes #0 = { "amdgpu-num-sgpr"="14" }
+attributes #1 = { "amdgpu-num-vgpr"="20" }
Index: test/MC/AMDGPU/hsa-metadata-kernel-args-v3.s
===================================================================
--- /dev/null
+++ test/MC/AMDGPU/hsa-metadata-kernel-args-v3.s
@@ -0,0 +1,96 @@
+// RUN: llvm-mc -mattr=+code-object-v3 -triple=amdgcn-amd-amdhsa -mcpu=gfx700 -show-encoding %s | FileCheck --check-prefix=CHECK --check-prefix=GFX700 %s
+// RUN: llvm-mc -mattr=+code-object-v3 -triple=amdgcn-amd-amdhsa -mcpu=gfx800 -show-encoding %s | FileCheck --check-prefix=CHECK --check-prefix=GFX800 %s
+// RUN: llvm-mc -mattr=+code-object-v3 -triple=amdgcn-amd-amdhsa -mcpu=gfx900 -show-encoding %s | FileCheck --check-prefix=CHECK --check-prefix=GFX900 %s
+
+// CHECK:  .amd_amdgpu_hsa_metadata
+// CHECK:    amdhsa.kernels:
+// CHECK-NEXT:      - .max_flat_workgroup_size: 256
+// CHECK-NEXT:        .wavefront_size: 128
+// CHECK-NEXT:        .symbol:      'test_kernel@kd'
+// CHECK-NEXT:        .kernarg_segment_size: 8
+// CHECK-NEXT:        .private_segment_fixed_size: 32
+// CHECK-NEXT:        .name:            test_kernel
+// CHECK-NEXT:        .language:        OpenCL C
+// CHECK-NEXT:        .sgpr_count: 14
+// CHECK-NEXT:        .kernarg_segment_align: 64
+// CHECK-NEXT:        .vgpr_count: 40
+// CHECK-NEXT:        .group_segment_fixed_size: 16
+// CHECK-NEXT:        .language_version:
+// CHECK-NEXT:          - 2
+// CHECK-NEXT:          - 0
+// CHECK-NEXT:        .args:
+// CHECK-NEXT:          - .type_name:      char
+// CHECK-NEXT:            .value_kind:     by_value
+// CHECK-NEXT:            .offset:         1
+// CHECK-NEXT:            .size:          1
+// CHECK-NEXT:            .value_type:     i8
+// CHECK-NEXT:          - .value_kind:     hidden_global_offset_x
+// CHECK-NEXT:            .offset:         8
+// CHECK-NEXT:            .size:          8
+// CHECK-NEXT:            .value_type:     i64
+// CHECK-NEXT:          - .value_kind:     hidden_global_offset_y
+// CHECK-NEXT:            .offset:         8
+// CHECK-NEXT:            .size:          8
+// CHECK-NEXT:            .value_type:     i64
+// CHECK-NEXT:          - .value_kind:     hidden_global_offset_z
+// CHECK-NEXT:            .offset:         8
+// CHECK-NEXT:            .size:          8
+// CHECK-NEXT:            .value_type:     i64
+// CHECK-NEXT:          - .value_kind:     hidden_printf_buffer
+// CHECK-NEXT:            .offset:         8
+// CHECK-NEXT:            .size:          8
+// CHECK-NEXT:            .value_type:     i8
+// CHECK-NEXT:            .address_space: global
+// CHECK:    amdhsa.version:
+// CHECK-NEXT:       - 1
+// CHECK-NEXT:       - 0
+// CHECK:    amdhsa.printf:
+// CHECK-NEXT:      - '1:1:4:%d\n'
+// CHECK-NEXt:      - '2:1:8:%g\n'
+// CHECK:  .end_amd_amdgpu_hsa_metadata
+.amd_amdgpu_hsa_metadata
+  amdhsa.version:
+    - 1
+    - 0
+  amdhsa.printf:
+    - '1:1:4:%d\n'
+    - '2:1:8:%g\n'
+  amdhsa.kernels:
+    - .name:            test_kernel
+      .symbol:      test_kernel@kd
+      .language:        OpenCL C
+      .language_version:
+        - 2
+        - 0
+      .kernarg_segment_size: 8
+      .group_segment_fixed_size: 16
+      .private_segment_fixed_size: 32
+      .kernarg_segment_align: 64
+      .wavefront_size: 128
+      .sgpr_count: 14
+      .vgpr_count: 40
+      .max_flat_workgroup_size: 256
+      .args:
+        - .type_name:      char
+          .size:          1
+          .offset:         1
+          .value_kind:     by_value
+          .value_type:     i8
+        - .size:          8
+          .offset:         8
+          .value_kind:     hidden_global_offset_x
+          .value_type:     i64
+        - .size:          8
+          .offset:         8
+          .value_kind:     hidden_global_offset_y
+          .value_type:     i64
+        - .size:          8
+          .offset:         8
+          .value_kind:     hidden_global_offset_z
+          .value_type:     i64
+        - .size:          8
+          .offset:         8
+          .value_kind:     hidden_printf_buffer
+          .value_type:     i8
+          .address_space: global
+.end_amd_amdgpu_hsa_metadata
Index: test/MC/AMDGPU/hsa-metadata-kernel-attrs-v3.s
===================================================================
--- /dev/null
+++ test/MC/AMDGPU/hsa-metadata-kernel-attrs-v3.s
@@ -0,0 +1,67 @@
+// RUN: llvm-mc -mattr=+code-object-v3 -triple=amdgcn-amd-amdhsa -mcpu=gfx700 -show-encoding %s | FileCheck --check-prefix=CHECK --check-prefix=GFX700 %s
+// RUN: llvm-mc -mattr=+code-object-v3 -triple=amdgcn-amd-amdhsa -mcpu=gfx800 -show-encoding %s | FileCheck --check-prefix=CHECK --check-prefix=GFX800 %s
+// RUN: llvm-mc -mattr=+code-object-v3 -triple=amdgcn-amd-amdhsa -mcpu=gfx900 -show-encoding %s | FileCheck --check-prefix=CHECK --check-prefix=GFX900 %s
+
+// CHECK:  .amd_amdgpu_hsa_metadata
+// CHECK:    amdhsa.kernels:
+// CHECK:      - .max_flat_workgroup_size: 256
+// CHECK:        .wavefront_size: 128
+// CHECK:        .symbol:      'test_kernel@kd'
+// CHECK:        .reqd_workgroup_size:
+// CHECK-NEXT:     - 1
+// CHECK-NEXT:     - 2
+// CHECK-NEXT:     - 4
+// CHECK:        .kernarg_segment_size: 8
+// CHECK:        .private_segment_fixed_size: 32
+// CHECK:        .workgroup_size_hint:
+// CHECK-NEXT:     - 8
+// CHECK-NEXT:     - 16
+// CHECK-NEXT:     - 32
+// CHECK:        .name:            test_kernel
+// CHECK:        .language:        OpenCL C
+// CHECK:        .sgpr_count:      14
+// CHECK:        .kernarg_segment_align: 64
+// CHECK:        .vgpr_count:      40
+// CHECK:        .language_version:
+// CHECK-NEXT:     - 2
+// CHECK-NEXT:     - 0
+// CHECK:        .vec_type_hint:       int
+// CHECK:    amdhsa.version:
+// CHECK-NEXT: - 1
+// CHECK-NEXT: - 0
+// CHECK:    amdhsa.printf:
+// CHECK:      - '1:1:4:%d\n'
+// CHECK:      - '2:1:8:%g\n'
+// CHECK: .end_amd_amdgpu_hsa_metadata
+.amd_amdgpu_hsa_metadata
+  amdhsa.version:
+    - 1
+    - 0
+  amdhsa.printf:
+    - '1:1:4:%d\n'
+    - '2:1:8:%g\n'
+  amdhsa.kernels:
+    - .name:            test_kernel
+      .symbol:      test_kernel@kd
+      .language:        OpenCL C
+      .language_version:
+        - 2
+        - 0
+      .kernarg_segment_size: 8
+      .group_segment_fixed_size: 16
+      .private_segment_fixed_size: 32
+      .kernarg_segment_align: 64
+      .wavefront_size: 128
+      .sgpr_count: 14
+      .vgpr_count: 40
+      .max_flat_workgroup_size: 256
+      .reqd_workgroup_size:
+        - 1
+        - 2
+        - 4
+      .workgroup_size_hint:
+        - 8
+        - 16
+        - 32
+      .vec_type_hint:       int
+.end_amd_amdgpu_hsa_metadata
Index: test/MC/AMDGPU/hsa-metadata-kernel-code-props-v3.s
===================================================================
--- /dev/null
+++ test/MC/AMDGPU/hsa-metadata-kernel-code-props-v3.s
@@ -0,0 +1,42 @@
+// RUN: llvm-mc -mattr=+code-object-v3 -triple=amdgcn-amd-amdhsa -mcpu=gfx700 -show-encoding %s | FileCheck --check-prefix=CHECK --check-prefix=GFX700 %s
+// RUN: llvm-mc -mattr=+code-object-v3 -triple=amdgcn-amd-amdhsa -mcpu=gfx800 -show-encoding %s | FileCheck --check-prefix=CHECK --check-prefix=GFX800 %s
+// RUN: llvm-mc -mattr=+code-object-v3 -triple=amdgcn-amd-amdhsa -mcpu=gfx900 -show-encoding %s | FileCheck --check-prefix=CHECK --check-prefix=GFX900 %s
+
+// CHECK:  .amd_amdgpu_hsa_metadata
+// CHECK:    amdhsa.kernels:
+// CHECK:      - .sgpr_count: 40
+// CHECK:        .max_flat_workgroup_size:    256
+// CHECK:        .symbol: 'test_kernel@kd'
+// CHECK:        .kernarg_segment_size:      24
+// CHECK:        .group_segment_fixed_size:   24
+// CHECK:        .private_segment_fixed_size: 16
+// CHECK:        .vgpr_count: 14
+// CHECK:        .vgpr_spill_count: 1
+// CHECK:        .kernarg_segment_align:     16
+// CHECK:        .sgpr_spill_count: 1
+// CHECK:        .wavefront_size:           64
+// CHECK:        .name:       test_kernel
+// CHECK:    amdhsa.version:
+// CHECK-NEXT: - 1
+// CHECK-NEXT: - 0
+.amd_amdgpu_hsa_metadata
+  amdhsa.version:
+    - 1
+    - 0
+  amdhsa.printf:
+    - '1:1:4:%d\n'
+    - '2:1:8:%g\n'
+  amdhsa.kernels:
+    - .name:            test_kernel
+      .symbol:      test_kernel@kd
+      .kernarg_segment_size:      24
+      .group_segment_fixed_size:   24
+      .private_segment_fixed_size: 16
+      .kernarg_segment_align:     16
+      .wavefront_size:           64
+      .max_flat_workgroup_size:    256
+      .sgpr_count:               40
+      .vgpr_count:               14
+      .sgpr_spill_count:         1
+      .vgpr_spill_count:         1
+.end_amd_amdgpu_hsa_metadata
Index: test/MC/AMDGPU/hsa-v3.s
===================================================================
--- /dev/null
+++ test/MC/AMDGPU/hsa-v3.s
@@ -0,0 +1,310 @@
+// RUN: llvm-mc -mattr=+code-object-v3 -triple amdgcn--amdhsa -mcpu=kaveri -show-encoding %s | FileCheck %s --check-prefix=ASM
+// RUN: llvm-mc -mattr=+code-object-v3 -filetype=obj -triple amdgcn--amdhsa -mcpu=kaveri -show-encoding %s | llvm-readobj -symbols -s -sd | FileCheck %s --check-prefix=ELF
+
+// ELF: Section {
+// ELF: Name: .text
+// ELF: Type: SHT_PROGBITS (0x1)
+// ELF: Flags [ (0x6)
+// ELF: SHF_ALLOC (0x2)
+// ELF: SHF_EXECINSTR (0x4)
+
+// ELF: SHT_NOTE
+// ELF: 0000: 04000000 08000000 01000000 414D4400
+// ELF: 0010: 02000000 00000000 04000000 1B000000
+// ELF: 0020: 03000000 414D4400 04000700 07000000
+// ELF: 0030: 00000000 00000000 414D4400 414D4447
+// ELF: 0040: 50550000
+// We can't check binary representation of metadata note: it is different on
+// Windows and Linux because of carriage return on Windows
+
+// ELF: Symbol {
+// ELF: Name: amd_kernel_code_t_minimal
+// ELF: Type: AMDGPU_HSA_KERNEL (0xA)
+// ELF: Section: .text
+// ELF: }
+// ELF: Symbol {
+// ELF: Name: amd_kernel_code_t_test_all
+// ELF: Type: AMDGPU_HSA_KERNEL (0xA)
+// ELF: Section: .text
+// ELF: }
+
+.text
+// ASM: .text
+
+.hsa_code_object_version 2,0
+// ASM: .hsa_code_object_version 2,0
+
+.hsa_code_object_isa 7,0,0,"AMD","AMDGPU"
+// ASM: .hsa_code_object_isa 7,0,0,"AMD","AMDGPU"
+
+.amd_amdgpu_hsa_metadata
+  amdhsa.version:
+    - 3
+    - 0
+  amdhsa.kernels:
+    - .name:       amd_kernel_code_t_test_all
+      .symbol: amd_kernel_code_t_test_all@kd
+      .kernarg_segment_size: 8
+      .group_segment_fixed_size: 16
+      .private_segment_fixed_size: 32
+      .kernarg_segment_align: 64
+      .wavefront_size: 128
+      .sgpr_count: 14
+      .vgpr_count: 40
+      .max_flat_workgroup_size: 256
+    - .name:       amd_kernel_code_t_minimal
+      .symbol: amd_kernel_code_t_minimal@kd
+      .kernarg_segment_size: 8
+      .group_segment_fixed_size: 16
+      .private_segment_fixed_size: 32
+      .kernarg_segment_align: 64
+      .wavefront_size: 128
+      .sgpr_count: 14
+      .vgpr_count: 40
+      .max_flat_workgroup_size: 256
+.end_amd_amdgpu_hsa_metadata
+
+// ASM: .amd_amdgpu_hsa_metadata
+// ASM:    amdhsa.kernels:
+// ASM:  - .sgpr_count:     14
+// ASM:    .max_flat_workgroup_size: 256
+// ASM:    .symbol:         'amd_kernel_code_t_test_all@kd'
+// ASM:    .kernarg_segment_size: 8
+// ASM:    .group_segment_fixed_size: 16
+// ASM:    .private_segment_fixed_size: 32
+// ASM:    .vgpr_count:     40
+// ASM:    .kernarg_segment_align: 64
+// ASM:    .wavefront_size: 128
+// ASM:    .name:           amd_kernel_code_t_test_all
+// ASM:  - .sgpr_count:     14
+// ASM:    .max_flat_workgroup_size: 256
+// ASM:    .symbol:         'amd_kernel_code_t_minimal@kd'
+// ASM:    .kernarg_segment_size: 8
+// ASM:    .group_segment_fixed_size: 16
+// ASM:    .private_segment_fixed_size: 32
+// ASM:    .vgpr_count:     40
+// ASM:    .kernarg_segment_align: 64
+// ASM:    .wavefront_size: 128
+// ASM:    .name:           amd_kernel_code_t_minimal
+// ASM:    amdhsa.version:
+// ASM-NEXT: - 3
+// ASM-NEXT: - 0
+// ASM: .end_amd_amdgpu_hsa_metadata
+
+.amdgpu_hsa_kernel amd_kernel_code_t_test_all
+.amdgpu_hsa_kernel amd_kernel_code_t_minimal
+
+amd_kernel_code_t_test_all:
+; Test all amd_kernel_code_t members with non-default values.
+.amd_kernel_code_t
+    kernel_code_version_major = 100
+    kernel_code_version_minor = 100
+    machine_kind = 0
+    machine_version_major = 5
+    machine_version_minor = 5
+    machine_version_stepping = 5
+    kernel_code_entry_byte_offset = 512
+    kernel_code_prefetch_byte_size = 1
+    max_scratch_backing_memory_byte_size = 1
+    compute_pgm_rsrc1_vgprs = 1
+    compute_pgm_rsrc1_sgprs = 1
+    compute_pgm_rsrc1_priority = 1
+    compute_pgm_rsrc1_float_mode = 1
+    compute_pgm_rsrc1_priv = 1
+    compute_pgm_rsrc1_dx10_clamp = 1
+    compute_pgm_rsrc1_debug_mode = 1
+    compute_pgm_rsrc1_ieee_mode = 1
+    compute_pgm_rsrc2_scratch_en = 1
+    compute_pgm_rsrc2_user_sgpr = 1
+    compute_pgm_rsrc2_tgid_x_en = 1
+    compute_pgm_rsrc2_tgid_y_en = 1
+    compute_pgm_rsrc2_tgid_z_en = 1
+    compute_pgm_rsrc2_tg_size_en = 1
+    compute_pgm_rsrc2_tidig_comp_cnt = 1
+    compute_pgm_rsrc2_excp_en_msb = 1
+    compute_pgm_rsrc2_lds_size = 1
+    compute_pgm_rsrc2_excp_en = 1
+    enable_sgpr_private_segment_buffer = 1
+    enable_sgpr_dispatch_ptr = 1
+    enable_sgpr_queue_ptr = 1
+    enable_sgpr_kernarg_segment_ptr = 1
+    enable_sgpr_dispatch_id = 1
+    enable_sgpr_flat_scratch_init = 1
+    enable_sgpr_private_segment_size = 1
+    enable_sgpr_grid_workgroup_count_x = 1
+    enable_sgpr_grid_workgroup_count_y = 1
+    enable_sgpr_grid_workgroup_count_z = 1
+    enable_ordered_append_gds = 1
+    private_element_size = 1
+    is_ptr64 = 1
+    is_dynamic_callstack = 1
+    is_debug_enabled = 1
+    is_xnack_enabled = 1
+    workitem_private_segment_byte_size = 1
+    workgroup_group_segment_byte_size = 1
+    gds_segment_byte_size = 1
+    kernarg_segment_byte_size = 1
+    workgroup_fbarrier_count = 1
+    wavefront_sgpr_count = 1
+    workitem_vgpr_count = 1
+    reserved_vgpr_first = 1
+    reserved_vgpr_count = 1
+    reserved_sgpr_first = 1
+    reserved_sgpr_count = 1
+    debug_wavefront_private_segment_offset_sgpr = 1
+    debug_private_segment_buffer_sgpr = 1
+    kernarg_segment_alignment = 5
+    group_segment_alignment = 5
+    private_segment_alignment = 5
+    wavefront_size = 5
+    call_convention = 1
+    runtime_loader_kernel_symbol = 1
+.end_amd_kernel_code_t
+
+// ASM-LABEL: {{^}}amd_kernel_code_t_test_all:
+// ASM: .amd_kernel_code_t
+// ASM: amd_code_version_major = 100
+// ASM: amd_code_version_minor = 100
+// ASM: amd_machine_kind = 0
+// ASM: amd_machine_version_major = 5
+// ASM: amd_machine_version_minor = 5
+// ASM: amd_machine_version_stepping = 5
+// ASM: kernel_code_entry_byte_offset = 512
+// ASM: kernel_code_prefetch_byte_size = 1
+// ASM: granulated_workitem_vgpr_count = 1
+// ASM: granulated_wavefront_sgpr_count = 1
+// ASM: priority = 1
+// ASM: float_mode = 1
+// ASM: priv = 1
+// ASM: enable_dx10_clamp = 1
+// ASM: debug_mode = 1
+// ASM: enable_ieee_mode = 1
+// ASM: enable_sgpr_private_segment_wave_byte_offset = 1
+// ASM: user_sgpr_count = 1
+// ASM: enable_sgpr_workgroup_id_x = 1
+// ASM: enable_sgpr_workgroup_id_y = 1
+// ASM: enable_sgpr_workgroup_id_z = 1
+// ASM: enable_sgpr_workgroup_info = 1
+// ASM: enable_vgpr_workitem_id = 1
+// ASM: enable_exception_msb = 1
+// ASM: granulated_lds_size = 1
+// ASM: enable_exception = 1
+// ASM: enable_sgpr_private_segment_buffer = 1
+// ASM: enable_sgpr_dispatch_ptr = 1
+// ASM: enable_sgpr_queue_ptr = 1
+// ASM: enable_sgpr_kernarg_segment_ptr = 1
+// ASM: enable_sgpr_dispatch_id = 1
+// ASM: enable_sgpr_flat_scratch_init = 1
+// ASM: enable_sgpr_private_segment_size = 1
+// ASM: enable_sgpr_grid_workgroup_count_x = 1
+// ASM: enable_sgpr_grid_workgroup_count_y = 1
+// ASM: enable_sgpr_grid_workgroup_count_z = 1
+// ASM: enable_ordered_append_gds = 1
+// ASM: private_element_size = 1
+// ASM: is_ptr64 = 1
+// ASM: is_dynamic_callstack = 1
+// ASM: is_debug_enabled = 1
+// ASM: is_xnack_enabled = 1
+// ASM: workitem_private_segment_byte_size = 1
+// ASM: workgroup_group_segment_byte_size = 1
+// ASM: gds_segment_byte_size = 1
+// ASM: kernarg_segment_byte_size = 1
+// ASM: workgroup_fbarrier_count = 1
+// ASM: wavefront_sgpr_count = 1
+// ASM: workitem_vgpr_count = 1
+// ASM: reserved_vgpr_first = 1
+// ASM: reserved_vgpr_count = 1
+// ASM: reserved_sgpr_first = 1
+// ASM: reserved_sgpr_count = 1
+// ASM: debug_wavefront_private_segment_offset_sgpr = 1
+// ASM: debug_private_segment_buffer_sgpr = 1
+// ASM: kernarg_segment_alignment = 5
+// ASM: group_segment_alignment = 5
+// ASM: private_segment_alignment = 5
+// ASM: wavefront_size = 5
+// ASM: call_convention = 1
+// ASM: runtime_loader_kernel_symbol = 1
+// ASM: .end_amd_kernel_code_t
+
+amd_kernel_code_t_minimal:
+.amd_kernel_code_t
+	enable_sgpr_kernarg_segment_ptr = 1
+	is_ptr64 = 1
+	granulated_workitem_vgpr_count = 1
+	granulated_wavefront_sgpr_count = 1
+	user_sgpr_count = 2
+	kernarg_segment_byte_size = 16
+	wavefront_sgpr_count = 8
+//      wavefront_sgpr_count = 7
+;	wavefront_sgpr_count = 7
+// Make sure a blank line won't break anything:
+
+// Make sure a line with whitespace won't break anything:
+
+	workitem_vgpr_count = 16
+.end_amd_kernel_code_t
+
+// ASM-LABEL: {{^}}amd_kernel_code_t_minimal:
+// ASM: .amd_kernel_code_t
+// ASM:	amd_code_version_major = 1
+// ASM:	amd_code_version_minor = 2
+// ASM:	amd_machine_kind = 1
+// ASM:	amd_machine_version_major = 7
+// ASM:	amd_machine_version_minor = 0
+// ASM:	amd_machine_version_stepping = 0
+// ASM:	kernel_code_entry_byte_offset = 256
+// ASM:	kernel_code_prefetch_byte_size = 0
+// ASM: granulated_workitem_vgpr_count = 1
+// ASM: granulated_wavefront_sgpr_count = 1
+// ASM: priority = 0
+// ASM: float_mode = 0
+// ASM: priv = 0
+// ASM: enable_dx10_clamp = 0
+// ASM: debug_mode = 0
+// ASM: enable_ieee_mode = 0
+// ASM: enable_sgpr_private_segment_wave_byte_offset = 0
+// ASM: user_sgpr_count = 2
+// ASM: enable_sgpr_workgroup_id_x = 0
+// ASM: enable_sgpr_workgroup_id_y = 0
+// ASM: enable_sgpr_workgroup_id_z = 0
+// ASM: enable_sgpr_workgroup_info = 0
+// ASM: enable_vgpr_workitem_id = 0
+// ASM: enable_exception_msb = 0
+// ASM: granulated_lds_size = 0
+// ASM: enable_exception = 0
+// ASM:	enable_sgpr_private_segment_buffer = 0
+// ASM:	enable_sgpr_dispatch_ptr = 0
+// ASM:	enable_sgpr_queue_ptr = 0
+// ASM:	enable_sgpr_kernarg_segment_ptr = 1
+// ASM:	enable_sgpr_dispatch_id = 0
+// ASM:	enable_sgpr_flat_scratch_init = 0
+// ASM:	enable_sgpr_private_segment_size = 0
+// ASM:	enable_sgpr_grid_workgroup_count_x = 0
+// ASM:	enable_sgpr_grid_workgroup_count_y = 0
+// ASM:	enable_sgpr_grid_workgroup_count_z = 0
+// ASM:	enable_ordered_append_gds = 0
+// ASM:	private_element_size = 0
+// ASM:	is_ptr64 = 1
+// ASM:	is_dynamic_callstack = 0
+// ASM:	is_debug_enabled = 0
+// ASM:	is_xnack_enabled = 0
+// ASM:	workitem_private_segment_byte_size = 0
+// ASM:	workgroup_group_segment_byte_size = 0
+// ASM:	gds_segment_byte_size = 0
+// ASM:	kernarg_segment_byte_size = 16
+// ASM:	workgroup_fbarrier_count = 0
+// ASM:	wavefront_sgpr_count = 8
+// ASM:	workitem_vgpr_count = 16
+// ASM:	reserved_vgpr_first = 0
+// ASM:	reserved_vgpr_count = 0
+// ASM:	reserved_sgpr_first = 0
+// ASM:	reserved_sgpr_count = 0
+// ASM:	debug_wavefront_private_segment_offset_sgpr = 0
+// ASM:	debug_private_segment_buffer_sgpr = 0
+// ASM:	kernarg_segment_alignment = 4
+// ASM:	group_segment_alignment = 4
+// ASM:	private_segment_alignment = 4
+// ASM:	wavefront_size = 6
+// ASM:	call_convention = -1
+// ASM:	runtime_loader_kernel_symbol = 0
+// ASM: .end_amd_kernel_code_t
Index: tools/llvm-readobj/ELFDumper.cpp
===================================================================
--- tools/llvm-readobj/ELFDumper.cpp
+++ tools/llvm-readobj/ELFDumper.cpp
@@ -28,7 +28,9 @@
 #include "llvm/ADT/StringExtras.h"
 #include "llvm/ADT/StringRef.h"
 #include "llvm/ADT/Twine.h"
+#include "llvm/BinaryFormat/AMDGPUMetadataVerifier.h"
 #include "llvm/BinaryFormat/ELF.h"
+#include "llvm/BinaryFormat/MsgPackTypes.h"
 #include "llvm/Object/ELF.h"
 #include "llvm/Object/ELFObjectFile.h"
 #include "llvm/Object/ELFTypes.h"
@@ -3466,7 +3468,7 @@
   return OS.str();
 }
 
-static std::string getAMDGPUNoteTypeName(const uint32_t NT) {
+static std::string getAMDNoteTypeName(const uint32_t NT) {
   static const struct {
     uint32_t ID;
     const char *Name;
@@ -3489,6 +3491,16 @@
   return OS.str();
 }
 
+static std::string getAMDGPUNoteTypeName(const uint32_t NT) {
+  if (NT == ELF::NT_AMDGPU_METADATA)
+    return std::string("NT_AMDGPU_METADATA (AMDGPU Metadata)");
+
+  std::string string;
+  raw_string_ostream OS(string);
+  OS << format("Unknown note type (0x%08x)", NT);
+  return OS.str();
+}
+
 template <typename ELFT>
 static void printGNUProperty(raw_ostream &OS, uint32_t Type, uint32_t DataSize,
                              ArrayRef<uint8_t> Data) {
@@ -3609,8 +3621,8 @@
 }
 
 template <typename ELFT>
-static void printAMDGPUNote(raw_ostream &OS, uint32_t NoteType,
-                            ArrayRef<typename ELFT::Word> Words, size_t Size) {
+static void printAMDNote(raw_ostream &OS, uint32_t NoteType,
+                         ArrayRef<typename ELFT::Word> Words, size_t Size) {
   switch (NoteType) {
   default:
     return;
@@ -3640,6 +3652,32 @@
   OS.flush();
 }
 
+template <typename ELFT>
+static void printAMDGPUNote(raw_ostream &OS, uint32_t NoteType,
+                            ArrayRef<typename ELFT::Word> Words, size_t Size) {
+  OS << "    AMDGPU Metadata:\n";
+  auto MsgPackString = StringRef(reinterpret_cast<const char *>(Words.data()), Size);
+  msgpack::Reader MsgPackReader(MsgPackString);
+  auto OptMsgPackNodeOrErr = msgpack::Node::read(MsgPackReader);
+  if (auto Err = OptMsgPackNodeOrErr.takeError())
+    return error(std::move(Err));
+  auto &OptMsgPackNode = *OptMsgPackNodeOrErr;
+  if (!OptMsgPackNode)
+    return;
+  auto &MsgPackNode = *OptMsgPackNode;
+
+  AMDGPU::HSAMD::V3::MetadataVerifier Verifier(true);
+  if (!Verifier.verify(MsgPackNode))
+    reportError("invalid AMDGPU metadata");
+
+  std::string HSAMetadataString;
+  raw_string_ostream StrOS(HSAMetadataString);
+  yaml::Output YOut(StrOS);
+  YOut << MsgPackNode;
+  OS << StrOS.str();
+  OS.flush();
+}
+
 template <class ELFT>
 void GNUStyle<ELFT>::printNotes(const ELFFile<ELFT> *Obj) {
   const Elf_Ehdr *e = Obj->getHeader();
@@ -3666,6 +3704,9 @@
     } else if (Name == "FreeBSD") {
       OS << getFreeBSDNoteTypeName(Type) << '\n';
     } else if (Name == "AMD") {
+      OS << getAMDNoteTypeName(Type) << '\n';
+      printAMDNote<ELFT>(OS, Type, Descriptor, Descriptor.size());
+    } else if (Name == "AMDGPU") {
       OS << getAMDGPUNoteTypeName(Type) << '\n';
       printAMDGPUNote<ELFT>(OS, Type, Descriptor, Descriptor.size());
     } else {