diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
--- a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
@@ -441,7 +441,7 @@
 
   assert(isUInt<32>(PI.ScratchSize));
   assert(isUInt<32>(PI.getComputePGMRSrc1()));
-  assert(isUInt<32>(PI.ComputePGMRSrc2));
+  assert(isUInt<32>(PI.getComputePGMRSrc2()));
 
   KernelDescriptor.group_segment_fixed_size = PI.LDSSize;
   KernelDescriptor.private_segment_fixed_size = PI.ScratchSize;
@@ -450,7 +450,7 @@
   KernelDescriptor.kernarg_size = STM.getKernArgSegmentSize(F, MaxKernArgAlign);
 
   KernelDescriptor.compute_pgm_rsrc1 = PI.getComputePGMRSrc1();
-  KernelDescriptor.compute_pgm_rsrc2 = PI.ComputePGMRSrc2;
+  KernelDescriptor.compute_pgm_rsrc2 = PI.getComputePGMRSrc2();
   KernelDescriptor.kernel_code_properties = getAmdhsaKernelCodeProperties(MF);
 
   assert(STM.hasGFX90AInsts() || CurrentProgramInfo.ComputePGMRSrc3GFX90A == 0);
@@ -579,28 +579,27 @@
     OutStreamer->emitRawComment(
       " WaveLimiterHint : " + Twine(MFI->needsWaveLimiter()), false);
 
-    OutStreamer->emitRawComment(
-      " COMPUTE_PGM_RSRC2:SCRATCH_EN: " +
-      Twine(G_00B84C_SCRATCH_EN(CurrentProgramInfo.ComputePGMRSrc2)), false);
-    OutStreamer->emitRawComment(
-      " COMPUTE_PGM_RSRC2:USER_SGPR: " +
-      Twine(G_00B84C_USER_SGPR(CurrentProgramInfo.ComputePGMRSrc2)), false);
-    OutStreamer->emitRawComment(
-      " COMPUTE_PGM_RSRC2:TRAP_HANDLER: " +
-      Twine(G_00B84C_TRAP_HANDLER(CurrentProgramInfo.ComputePGMRSrc2)), false);
-    OutStreamer->emitRawComment(
-      " COMPUTE_PGM_RSRC2:TGID_X_EN: " +
-      Twine(G_00B84C_TGID_X_EN(CurrentProgramInfo.ComputePGMRSrc2)), false);
-    OutStreamer->emitRawComment(
-      " COMPUTE_PGM_RSRC2:TGID_Y_EN: " +
-      Twine(G_00B84C_TGID_Y_EN(CurrentProgramInfo.ComputePGMRSrc2)), false);
-    OutStreamer->emitRawComment(
-      " COMPUTE_PGM_RSRC2:TGID_Z_EN: " +
-      Twine(G_00B84C_TGID_Z_EN(CurrentProgramInfo.ComputePGMRSrc2)), false);
-    OutStreamer->emitRawComment(
-      " COMPUTE_PGM_RSRC2:TIDIG_COMP_CNT: " +
-      Twine(G_00B84C_TIDIG_COMP_CNT(CurrentProgramInfo.ComputePGMRSrc2)),
-      false);
+    OutStreamer->emitRawComment(" COMPUTE_PGM_RSRC2:SCRATCH_EN: " +
+                                    Twine(CurrentProgramInfo.ScratchEnable),
+                                false);
+    OutStreamer->emitRawComment(" COMPUTE_PGM_RSRC2:USER_SGPR: " +
+                                    Twine(CurrentProgramInfo.UserSGPR),
+                                false);
+    OutStreamer->emitRawComment(" COMPUTE_PGM_RSRC2:TRAP_HANDLER: " +
+                                    Twine(CurrentProgramInfo.TrapHandlerEnable),
+                                false);
+    OutStreamer->emitRawComment(" COMPUTE_PGM_RSRC2:TGID_X_EN: " +
+                                    Twine(CurrentProgramInfo.TGIdXEnable),
+                                false);
+    OutStreamer->emitRawComment(" COMPUTE_PGM_RSRC2:TGID_Y_EN: " +
+                                    Twine(CurrentProgramInfo.TGIdYEnable),
+                                false);
+    OutStreamer->emitRawComment(" COMPUTE_PGM_RSRC2:TGID_Z_EN: " +
+                                    Twine(CurrentProgramInfo.TGIdZEnable),
+                                false);
+    OutStreamer->emitRawComment(" COMPUTE_PGM_RSRC2:TIDIG_COMP_CNT: " +
+                                    Twine(CurrentProgramInfo.TIdIGCompCount),
+                                false);
 
     assert(STM.hasGFX90AInsts() ||
            CurrentProgramInfo.ComputePGMRSrc3GFX90A == 0);
@@ -922,22 +921,21 @@
   // anything to disable it if we know the stack isn't used here. We may still
   // have emitted code reading it to initialize scratch, but if that's unused
   // reading garbage should be OK.
-  const bool EnablePrivateSegment =
+  ProgInfo.ScratchEnable =
       ProgInfo.ScratchBlocks > 0 || ProgInfo.DynamicCallStack;
-  ProgInfo.ComputePGMRSrc2 =
-      S_00B84C_SCRATCH_EN(EnablePrivateSegment) |
-      S_00B84C_USER_SGPR(MFI->getNumUserSGPRs()) |
-      // For AMDHSA, TRAP_HANDLER must be zero, as it is populated by the CP.
-      S_00B84C_TRAP_HANDLER(STM.isAmdHsaOS() ? 0 : STM.isTrapHandlerEnabled()) |
-      S_00B84C_TGID_X_EN(MFI->hasWorkGroupIDX()) |
-      S_00B84C_TGID_Y_EN(MFI->hasWorkGroupIDY()) |
-      S_00B84C_TGID_Z_EN(MFI->hasWorkGroupIDZ()) |
-      S_00B84C_TG_SIZE_EN(MFI->hasWorkGroupInfo()) |
-      S_00B84C_TIDIG_COMP_CNT(TIDIGCompCnt) |
-      S_00B84C_EXCP_EN_MSB(0) |
-      // For AMDHSA, LDS_SIZE must be zero, as it is populated by the CP.
-      S_00B84C_LDS_SIZE(STM.isAmdHsaOS() ? 0 : ProgInfo.LDSBlocks) |
-      S_00B84C_EXCP_EN(0);
+  ProgInfo.UserSGPR = MFI->getNumUserSGPRs();
+  // For AMDHSA, TRAP_HANDLER must be zero, as it is populated by the CP.
+  ProgInfo.TrapHandlerEnable =
+      STM.isAmdHsaOS() ? 0 : STM.isTrapHandlerEnabled();
+  ProgInfo.TGIdXEnable = MFI->hasWorkGroupIDX();
+  ProgInfo.TGIdYEnable = MFI->hasWorkGroupIDY();
+  ProgInfo.TGIdZEnable = MFI->hasWorkGroupIDZ();
+  ProgInfo.TGSizeEnable = MFI->hasWorkGroupInfo();
+  ProgInfo.TIdIGCompCount = TIDIGCompCnt;
+  ProgInfo.EXCPEnMSB = 0;
+  // For AMDHSA, LDS_SIZE must be zero, as it is populated by the CP.
+  ProgInfo.LdsSize = STM.isAmdHsaOS() ? 0 : ProgInfo.LDSBlocks;
+  ProgInfo.EXCPEnable = 0;
 
   if (STM.hasGFX90AInsts()) {
     AMDHSA_BITS_SET(ProgInfo.ComputePGMRSrc3GFX90A,
@@ -978,7 +976,7 @@
     OutStreamer->emitInt32(CurrentProgramInfo.getComputePGMRSrc1());
 
     OutStreamer->emitInt32(R_00B84C_COMPUTE_PGM_RSRC2);
-    OutStreamer->emitInt32(CurrentProgramInfo.ComputePGMRSrc2);
+    OutStreamer->emitInt32(CurrentProgramInfo.getComputePGMRSrc2());
 
     OutStreamer->emitInt32(R_00B860_COMPUTE_TMPRING_SIZE);
     OutStreamer->emitInt32(
@@ -1038,25 +1036,87 @@
   }
 
   MD->setNumUsedSgprs(CC, CurrentProgramInfo.NumSGPRsForWavesPerEU);
-  MD->setRsrc1(CC, CurrentProgramInfo.getPGMRSrc1(CC));
-  if (AMDGPU::isCompute(CC)) {
-    MD->setRsrc2(CC, CurrentProgramInfo.ComputePGMRSrc2);
+  if (MD->getPALMajorVersion() < 3) {
+    MD->setRsrc1(CC, CurrentProgramInfo.getPGMRSrc1(CC));
+    if (AMDGPU::isCompute(CC)) {
+      MD->setRsrc2(CC, CurrentProgramInfo.getComputePGMRSrc2());
+    } else {
+      if (CurrentProgramInfo.ScratchBlocks > 0)
+        MD->setRsrc2(CC, S_00B84C_SCRATCH_EN(1));
+    }
   } else {
-    if (CurrentProgramInfo.ScratchBlocks > 0)
-      MD->setRsrc2(CC, S_00B84C_SCRATCH_EN(1));
+    // Priority?
+    MD->setHwStage(CC, ".float_mode", CurrentProgramInfo.FloatMode);
+    // Priv?
+    // DX10Clamp?
+    MD->setHwStage(CC, ".debug_mode", (bool)CurrentProgramInfo.DebugMode);
+    MD->setHwStage(CC, ".ieee_mode", (bool)CurrentProgramInfo.IEEEMode);
+    MD->setHwStage(CC, ".wgp_mode", (bool)CurrentProgramInfo.WgpMode);
+    MD->setHwStage(CC, ".mem_ordered", (bool)CurrentProgramInfo.MemOrdered);
+
+    if (AMDGPU::isCompute(CC)) {
+      MD->setHwStage(CC, ".scratch_en", (bool)CurrentProgramInfo.ScratchEnable);
+      MD->setHwStage(CC, ".trap_present",
+                     (bool)CurrentProgramInfo.TrapHandlerEnable);
+
+      // Compute registers
+      // Assert that the tgid_x/y/z_en values set by the front-end agree with
+      // CurrentProgramInfo (usually driven by function attributes such as
+      // amdgpu-no-workgroup-id-x). NOTE(review): a missing field also fails.
+      assert(MD->checkComputeRegisters(".tgid_x_en",
+                                       (bool)CurrentProgramInfo.TGIdXEnable));
+      assert(MD->checkComputeRegisters(".tgid_y_en",
+                                       (bool)CurrentProgramInfo.TGIdYEnable));
+      assert(MD->checkComputeRegisters(".tgid_z_en",
+                                       (bool)CurrentProgramInfo.TGIdZEnable));
+
+      // EXCPEnMSB?
+      const unsigned LdsDwGranularity = 128;
+      MD->setHwStage(CC, ".lds_size",
+                     (unsigned)(CurrentProgramInfo.LdsSize * LdsDwGranularity *
+                                sizeof(uint32_t)));
+      MD->setHwStage(CC, ".excp_en", CurrentProgramInfo.EXCPEnable);
+    } else {
+      MD->setHwStage(CC, ".scratch_en", (bool)CurrentProgramInfo.ScratchEnable);
+    }
   }
+
   // ScratchSize is in bytes, 16 aligned.
   MD->setScratchSize(CC, alignTo(CurrentProgramInfo.ScratchSize, 16));
   if (MF.getFunction().getCallingConv() == CallingConv::AMDGPU_PS) {
     unsigned ExtraLDSSize = STM.getGeneration() >= AMDGPUSubtarget::GFX11
                                 ? divideCeil(CurrentProgramInfo.LDSBlocks, 2)
                                 : CurrentProgramInfo.LDSBlocks;
-    MD->setRsrc2(CC, S_00B02C_EXTRA_LDS_SIZE(ExtraLDSSize));
-    MD->setSpiPsInputEna(MFI->getPSInputEnable());
-    MD->setSpiPsInputAddr(MFI->getPSInputAddr());
+    if (MD->getPALMajorVersion() < 3) {
+      MD->setRsrc2(CC, S_00B02C_EXTRA_LDS_SIZE(ExtraLDSSize));
+      MD->setSpiPsInputEna(MFI->getPSInputEnable());
+      MD->setSpiPsInputAddr(MFI->getPSInputAddr());
+    } else {
+      // Graphics registers
+      MD->setGraphicsRegisters(".ps_extra_lds_size", ExtraLDSSize);
+      // Split PSInputEna/PSInputAddr into named per-bit boolean fields.
+      static StringLiteral const PsInputFields[] = {
+          ".persp_sample_ena",    ".persp_center_ena",
+          ".persp_centroid_ena",  ".persp_pull_model_ena",
+          ".linear_sample_ena",   ".linear_center_ena",
+          ".linear_centroid_ena", ".line_stipple_tex_ena",
+          ".pos_x_float_ena",     ".pos_y_float_ena",
+          ".pos_z_float_ena",     ".pos_w_float_ena",
+          ".front_face_ena",      ".ancillary_ena",
+          ".sample_coverage_ena", ".pos_fixed_pt_ena"};
+      unsigned PSInputEna = MFI->getPSInputEnable();
+      unsigned PSInputAddr = MFI->getPSInputAddr();
+      for (auto [Idx, Field] : enumerate(PsInputFields)) {
+        MD->setGraphicsRegisters(".spi_ps_input_ena", Field,
+                                 (bool)((PSInputEna >> Idx) & 1));
+        MD->setGraphicsRegisters(".spi_ps_input_addr", Field,
+                                 (bool)((PSInputAddr >> Idx) & 1));
+      }
+    }
   }
 
-  if (STM.isWave32())
+  // Version 3 and above already carry the wavefront size in the metadata.
+  if (MD->getPALMajorVersion() < 3 && STM.isWave32())
     MD->setWave32(MF.getFunction().getCallingConv());
 }
 
@@ -1068,7 +1128,7 @@
   // Set compute registers
   MD->setRsrc1(CallingConv::AMDGPU_CS,
                CurrentProgramInfo.getPGMRSrc1(CallingConv::AMDGPU_CS));
-  MD->setRsrc2(CallingConv::AMDGPU_CS, CurrentProgramInfo.ComputePGMRSrc2);
+  MD->setRsrc2(CallingConv::AMDGPU_CS, CurrentProgramInfo.getComputePGMRSrc2());
 
   // Set optional info
   MD->setFunctionLdsSize(MF, CurrentProgramInfo.LDSSize);
@@ -1104,7 +1164,7 @@
 
   Out.compute_pgm_resource_registers =
       CurrentProgramInfo.getComputePGMRSrc1() |
-      (CurrentProgramInfo.ComputePGMRSrc2 << 32);
+      (CurrentProgramInfo.getComputePGMRSrc2() << 32);
   Out.code_properties |= AMD_CODE_PROPERTY_IS_PTR64;
 
   if (CurrentProgramInfo.DynamicCallStack)
diff --git a/llvm/lib/Target/AMDGPU/SIProgramInfo.h b/llvm/lib/Target/AMDGPU/SIProgramInfo.h
--- a/llvm/lib/Target/AMDGPU/SIProgramInfo.h
+++ b/llvm/lib/Target/AMDGPU/SIProgramInfo.h
@@ -36,11 +36,23 @@
     uint32_t MemOrdered = 0; // GFX10+
     uint64_t ScratchSize = 0;
 
-    // Fields set in PGM_RSRC2 pm4 packet.
+    // State used to calculate fields set in PGM_RSRC2 pm4 packet.
     uint32_t LDSBlocks = 0;
     uint32_t ScratchBlocks = 0;
 
-    uint64_t ComputePGMRSrc2 = 0;
+    // Fields set in PGM_RSRC2 pm4 packet.
+    uint32_t ScratchEnable = 0;
+    uint32_t UserSGPR = 0;
+    uint32_t TrapHandlerEnable = 0;
+    uint32_t TGIdXEnable = 0;
+    uint32_t TGIdYEnable = 0;
+    uint32_t TGIdZEnable = 0;
+    uint32_t TGSizeEnable = 0;
+    uint32_t TIdIGCompCount = 0;
+    uint32_t EXCPEnMSB = 0;
+    uint32_t LdsSize = 0;
+    uint32_t EXCPEnable = 0;
+
     uint64_t ComputePGMRSrc3GFX90A = 0;
 
     uint32_t NumVGPR = 0;
@@ -75,6 +87,10 @@
     /// Compute the value of the ComputePGMRsrc1 register.
     uint64_t getComputePGMRSrc1() const;
     uint64_t getPGMRSrc1(CallingConv::ID CC) const;
+
+    /// Compute the value of the ComputePGMRsrc2 register.
+    uint64_t getComputePGMRSrc2() const;
+    uint64_t getPGMRSrc2(CallingConv::ID CC) const;
 };
 
 } // namespace llvm
diff --git a/llvm/lib/Target/AMDGPU/SIProgramInfo.cpp b/llvm/lib/Target/AMDGPU/SIProgramInfo.cpp
--- a/llvm/lib/Target/AMDGPU/SIProgramInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIProgramInfo.cpp
@@ -54,3 +54,23 @@
   }
   return Reg;
 }
+
+// Packs the separately stored PGM_RSRC2 fields into one register value.
+uint64_t SIProgramInfo::getComputePGMRSrc2() const {
+  return S_00B84C_SCRATCH_EN(ScratchEnable) |
+         S_00B84C_USER_SGPR(UserSGPR) |
+         S_00B84C_TRAP_HANDLER(TrapHandlerEnable) |
+         S_00B84C_TGID_X_EN(TGIdXEnable) | S_00B84C_TGID_Y_EN(TGIdYEnable) |
+         S_00B84C_TGID_Z_EN(TGIdZEnable) |
+         S_00B84C_TG_SIZE_EN(TGSizeEnable) |
+         S_00B84C_TIDIG_COMP_CNT(TIdIGCompCount) |
+         S_00B84C_EXCP_EN_MSB(EXCPEnMSB) | S_00B84C_LDS_SIZE(LdsSize) |
+         S_00B84C_EXCP_EN(EXCPEnable);
+}
+
+// PGM_RSRC2 value for calling convention CC; non-compute conventions always
+// yield 0.
+uint64_t SIProgramInfo::getPGMRSrc2(CallingConv::ID CC) const {
+  const bool IsCompute = AMDGPU::isCompute(CC);
+  return IsCompute ? getComputePGMRSrc2() : 0;
+}
diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUPALMetadata.h b/llvm/lib/Target/AMDGPU/Utils/AMDGPUPALMetadata.h
--- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUPALMetadata.h
+++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUPALMetadata.h
@@ -27,6 +27,11 @@
   msgpack::DocNode Registers;
   msgpack::DocNode HwStages;
   msgpack::DocNode ShaderFunctions;
+  bool VersionChecked = false;
+  msgpack::DocNode Version;
+  // From PAL version >= 3.0
+  msgpack::DocNode ComputeRegisters;
+  msgpack::DocNode GraphicsRegisters;
 
 public:
   // Read the amdgpu.pal.metadata supplied by the frontend, ready for
@@ -129,6 +134,26 @@
   // Set legacy PAL metadata format.
   void setLegacy();
 
+  unsigned getPALMajorVersion();
+  unsigned getPALMinorVersion();
+
+  void setHwStage(unsigned CC, StringRef field, unsigned Val);
+  void setHwStage(unsigned CC, StringRef field, bool Val);
+
+  void setComputeRegisters(StringRef field, unsigned Val);
+  void setComputeRegisters(StringRef field, bool Val);
+
+  // Returns nullptr if the field does not exist, rather than creating a new
+  // entry (which is what the other functions do).
+  msgpack::DocNode *refComputeRegister(StringRef field);
+  bool checkComputeRegisters(StringRef field, unsigned Val);
+  bool checkComputeRegisters(StringRef field, bool Val);
+
+  void setGraphicsRegisters(StringRef field, unsigned Val);
+  void setGraphicsRegisters(StringRef field, bool Val);
+  void setGraphicsRegisters(StringRef field1, StringRef field2, unsigned Val);
+  void setGraphicsRegisters(StringRef field1, StringRef field2, bool Val);
+
   // Erase all PAL metadata.
   void reset();
 
@@ -151,10 +176,29 @@
   // Get (create if necessary) a function in the shader functions map.
   msgpack::MapDocNode getShaderFunction(StringRef Name);
 
+  // Reference (create if necessary) the node for the compute_registers map.
+  msgpack::DocNode &refComputeRegisters();
+
+  // Get (create if necessary) the .compute_registers entry.
+  msgpack::MapDocNode getComputeRegisters();
+
+  // Reference (create if necessary) the node for the graphics registers map.
+  msgpack::DocNode &refGraphicsRegisters();
+
+  // Get (create if necessary) the .graphics_registers entry.
+  msgpack::MapDocNode getGraphicsRegisters();
+
+  // Reference (create if necessary) the node for the hardware_stages map.
+  msgpack::DocNode &refHwStage();
+
   // Get (create if necessary) the .hardware_stages entry for the given calling
   // convention.
   msgpack::MapDocNode getHwStage(unsigned CC);
 
+  // Get the PAL major (idx 0) or minor (idx 1) version. This is an internal
+  // helper for the public wrappers getPALMajorVersion and getPALMinorVersion.
+  unsigned getPALVersion(unsigned idx);
+
   bool setFromLegacyBlob(StringRef Blob);
   bool setFromMsgPackBlob(StringRef Blob);
   void toLegacyBlob(std::string &Blob);
diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUPALMetadata.cpp b/llvm/lib/Target/AMDGPU/Utils/AMDGPUPALMetadata.cpp
--- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUPALMetadata.cpp
+++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUPALMetadata.cpp
@@ -811,6 +811,38 @@
   return Functions[Name].getMap(/*Convert=*/true);
 }
 
+msgpack::DocNode &AMDGPUPALMetadata::refComputeRegisters() {
+  // Walk root -> "amdpal.pipelines"[0], converting each level as needed.
+  auto &Root = MsgPackDoc.getRoot().getMap(/*Convert=*/true);
+  auto &Pipes = Root[MsgPackDoc.getNode("amdpal.pipelines")].getArray(true);
+  auto &Pipe0 = Pipes[0].getMap(/*Convert=*/true);
+  auto &Regs = Pipe0[MsgPackDoc.getNode(".compute_registers")];
+  Regs.getMap(/*Convert=*/true); // Ensure the returned node is a map.
+  return Regs;
+}
+
+msgpack::MapDocNode AMDGPUPALMetadata::getComputeRegisters() {
+  if (!ComputeRegisters.isEmpty()) // Already stored by an earlier call.
+    return ComputeRegisters.getMap();
+  return (ComputeRegisters = refComputeRegisters()).getMap();
+}
+
+msgpack::DocNode &AMDGPUPALMetadata::refGraphicsRegisters() {
+  // Walk root -> "amdpal.pipelines"[0], converting each level as needed.
+  auto &Root = MsgPackDoc.getRoot().getMap(/*Convert=*/true);
+  auto &Pipes = Root[MsgPackDoc.getNode("amdpal.pipelines")].getArray(true);
+  auto &Pipe0 = Pipes[0].getMap(/*Convert=*/true);
+  auto &Regs = Pipe0[MsgPackDoc.getNode(".graphics_registers")];
+  Regs.getMap(/*Convert=*/true); // Ensure the returned node is a map.
+  return Regs;
+}
+
+msgpack::MapDocNode AMDGPUPALMetadata::getGraphicsRegisters() {
+  if (!GraphicsRegisters.isEmpty()) // Already stored by an earlier call.
+    return GraphicsRegisters.getMap();
+  return (GraphicsRegisters = refGraphicsRegisters()).getMap();
+}
+
 // Return the PAL metadata hardware shader stage name.
 static const char *getStageName(CallingConv::ID CC) {
   switch (CC) {
@@ -833,15 +865,21 @@
   }
 }
 
+msgpack::DocNode &AMDGPUPALMetadata::refHwStage() {
+  // Walk root -> "amdpal.pipelines"[0], converting each level as needed.
+  auto &Root = MsgPackDoc.getRoot().getMap(/*Convert=*/true);
+  auto &Pipes = Root[MsgPackDoc.getNode("amdpal.pipelines")].getArray(true);
+  auto &Pipe0 = Pipes[0].getMap(/*Convert=*/true);
+  auto &Stages = Pipe0[MsgPackDoc.getNode(".hardware_stages")];
+  Stages.getMap(/*Convert=*/true); // Ensure the returned node is a map.
+  return Stages;
+}
+
 // Get (create if necessary) the .hardware_stages entry for the given calling
 // convention.
 msgpack::MapDocNode AMDGPUPALMetadata::getHwStage(unsigned CC) {
   if (HwStages.isEmpty())
-    HwStages = MsgPackDoc.getRoot()
-                   .getMap(/*Convert=*/true)["amdpal.pipelines"]
-                   .getArray(/*Convert=*/true)[0]
-                   .getMap(/*Convert=*/true)[".hardware_stages"]
-                   .getMap(/*Convert=*/true);
+    HwStages = refHwStage();
   return HwStages.getMap()[getStageName(CC)].getMap(/*Convert=*/true);
 }
 
@@ -874,3 +912,78 @@
   Registers = MsgPackDoc.getEmptyNode();
   HwStages = MsgPackDoc.getEmptyNode();
 }
+
+// PAL version major (idx 0) or minor (idx 1); 2.6 if absent or malformed.
+unsigned AMDGPUPALMetadata::getPALVersion(unsigned idx) {
+  assert(idx < 2 &&
+         "illegal index to PAL version - should be 0 (major) or 1 (minor)");
+  if (!VersionChecked && Version.isEmpty()) {
+    auto &M = MsgPackDoc.getRoot().getMap(/*Convert=*/true);
+    auto I = M.find(MsgPackDoc.getNode("amdpal.version"));
+    if (I != M.end())
+      Version = I->second;
+  }
+  VersionChecked = true;
+  if (Version.isEmpty() || !Version.isArray() ||
+      Version.getArray().size() <= idx ||
+      Version.getArray()[idx].getKind() != msgpack::Type::UInt)
+    return idx ? 6 : 2; // Default 2.6; also avoids growing a short array.
+  return Version.getArray()[idx].getUInt();
+}
+
+// Public wrappers: index 0 selects the major, index 1 the minor version.
+unsigned AMDGPUPALMetadata::getPALMajorVersion() { return getPALVersion(0); }
+unsigned AMDGPUPALMetadata::getPALMinorVersion() { return getPALVersion(1); }
+
+// Set `field` in the .hardware_stages entry for calling convention CC.
+void AMDGPUPALMetadata::setHwStage(unsigned CC, StringRef field, unsigned Val) {
+  getHwStage(CC)[field] = Val;
+}
+
+void AMDGPUPALMetadata::setHwStage(unsigned CC, StringRef field, bool Val) {
+  getHwStage(CC)[field] = Val; // Bool overload: stores a boolean node.
+}
+
+void AMDGPUPALMetadata::setComputeRegisters(StringRef field, unsigned Val) {
+  getComputeRegisters()[field] = Val; // Creates the entry if it is missing.
+}
+
+void AMDGPUPALMetadata::setComputeRegisters(StringRef field, bool Val) {
+  getComputeRegisters()[field] = Val; // Bool overload: stores a boolean node.
+}
+
+msgpack::DocNode *AMDGPUPALMetadata::refComputeRegister(StringRef field) {
+  auto Regs = getComputeRegisters(); // May create .compute_registers itself.
+  auto It = Regs.find(field);
+  return It != Regs.end() ? &It->second : nullptr;
+}
+
+bool AMDGPUPALMetadata::checkComputeRegisters(StringRef field, unsigned Val) {
+  msgpack::DocNode *N = refComputeRegister(field);
+  // A field that is absent never matches.
+  return N && N->getUInt() == Val;
+}
+
+bool AMDGPUPALMetadata::checkComputeRegisters(StringRef field, bool Val) {
+  msgpack::DocNode *N = refComputeRegister(field);
+  // A field that is absent never matches.
+  return N && N->getBool() == Val;
+}
+
+void AMDGPUPALMetadata::setGraphicsRegisters(StringRef field, unsigned Val) {
+  getGraphicsRegisters()[field] = Val; // Top-level .graphics_registers entry.
+}
+
+void AMDGPUPALMetadata::setGraphicsRegisters(StringRef field, bool Val) {
+  getGraphicsRegisters()[field] = Val; // Bool overload: stores a boolean node.
+}
+
+void AMDGPUPALMetadata::setGraphicsRegisters(StringRef field1, StringRef field2,
+                                             unsigned Val) {
+  getGraphicsRegisters()[field1].getMap(true)[field2] = Val; // Nested entry.
+}
+
+void AMDGPUPALMetadata::setGraphicsRegisters(StringRef field1, StringRef field2,
+                                             bool Val) {
+  getGraphicsRegisters()[field1].getMap(true)[field2] = Val; // Nested entry.
+}
diff --git a/llvm/test/CodeGen/AMDGPU/pal-metadata-3.0.ll b/llvm/test/CodeGen/AMDGPU/pal-metadata-3.0.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/pal-metadata-3.0.ll
@@ -0,0 +1,165 @@
+; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx1100 <%s | FileCheck %s
+
+; CHECK-LABEL: {{^}}_amdgpu_cs_main:
+; CHECK: ; NumSgprs: 4
+; CHECK: ; NumVgprs: 2
+; CHECK:           .amdgpu_pal_metadata
+; CHECK-NEXT: ---
+; CHECK-NEXT: amdpal.pipelines:
+; CHECK-NEXT:   - .api:            Vulkan
+; CHECK-NEXT:     .compute_registers:
+; CHECK-NEXT:       .tg_size_en:     true
+; CHECK-NEXT:       .tgid_x_en:      false
+; CHECK-NEXT:       .tgid_y_en:      false
+; CHECK-NEXT:       .tgid_z_en:      false
+; CHECK-NEXT:       .tidig_comp_cnt: 0x1
+; CHECK-NEXT:     .graphics_registers:
+; CHECK-NEXT:      .ps_extra_lds_size: 0
+; CHECK-NEXT:      .spi_ps_input_addr:
+; CHECK-NEXT:        .ancillary_ena:  false
+; CHECK-NEXT:        .front_face_ena: true
+; CHECK-NEXT:        .line_stipple_tex_ena: false
+; CHECK-NEXT:        .linear_center_ena: true
+; CHECK-NEXT:        .linear_centroid_ena: true
+; CHECK-NEXT:        .linear_sample_ena: true
+; CHECK-NEXT:        .persp_center_ena: true
+; CHECK-NEXT:        .persp_centroid_ena: true
+; CHECK-NEXT:        .persp_pull_model_ena: false
+; CHECK-NEXT:        .persp_sample_ena: true
+; CHECK-NEXT:        .pos_fixed_pt_ena: true
+; CHECK-NEXT:        .pos_w_float_ena: false
+; CHECK-NEXT:        .pos_x_float_ena: false
+; CHECK-NEXT:        .pos_y_float_ena: false
+; CHECK-NEXT:        .pos_z_float_ena: false
+; CHECK-NEXT:        .sample_coverage_ena: false
+; CHECK-NEXT:      .spi_ps_input_ena:
+; CHECK-NEXT:        .ancillary_ena:  false
+; CHECK-NEXT:        .front_face_ena: false
+; CHECK-NEXT:        .line_stipple_tex_ena: false
+; CHECK-NEXT:        .linear_center_ena: false
+; CHECK-NEXT:        .linear_centroid_ena: false
+; CHECK-NEXT:        .linear_sample_ena: false
+; CHECK-NEXT:        .persp_center_ena: false
+; CHECK-NEXT:        .persp_centroid_ena: false
+; CHECK-NEXT:        .persp_pull_model_ena: false
+; CHECK-NEXT:        .persp_sample_ena: true
+; CHECK-NEXT:        .pos_fixed_pt_ena: false
+; CHECK-NEXT:        .pos_w_float_ena: false
+; CHECK-NEXT:        .pos_x_float_ena: false
+; CHECK-NEXT:        .pos_y_float_ena: false
+; CHECK-NEXT:        .pos_z_float_ena: false
+; CHECK-NEXT:        .sample_coverage_ena: false
+; CHECK-NEXT:    .hardware_stages:
+; CHECK-NEXT:      .cs:
+; CHECK-NEXT:        .checksum_value: 0x9444d7d0
+; CHECK-NEXT:        .debug_mode:     false
+; CHECK-NEXT:        .entry_point:    _amdgpu_cs_main
+; CHECK-NEXT:        .excp_en:        0
+; CHECK-NEXT:        .float_mode:     0xc0
+; CHECK-NEXT:        .ieee_mode:      false
+; CHECK-NEXT:        .image_op:       false
+; CHECK-NEXT:        .lds_size:       0
+; CHECK-NEXT:        .mem_ordered:    true
+; CHECK-NEXT:        .scratch_en:     false
+; CHECK-NEXT:        .scratch_memory_size: 0
+; CHECK-NEXT:        .sgpr_count:     0x4
+; CHECK-NEXT:        .sgpr_limit:     0x6a
+; CHECK-NEXT:        .threadgroup_dimensions:
+; CHECK-NEXT:          - 0x1
+; CHECK-NEXT:          - 0x400
+; CHECK-NEXT:          - 0x1
+; CHECK-NEXT:        .trap_present:   false
+; CHECK-NEXT:        .user_data_reg_map:
+; CHECK-NEXT:          - 0x10000000
+; CHECK-NEXT:          - 0xffffffff
+; CHECK-NEXT:          - 0
+; CHECK-NEXT:          - 0xffffffff
+; CHECK-NEXT:          - 0xffffffff
+; CHECK-NEXT:          - 0xffffffff
+; CHECK-NEXT:          - 0xffffffff
+; CHECK-NEXT:          - 0xffffffff
+; CHECK-NEXT:          - 0xffffffff
+; CHECK-NEXT:          - 0xffffffff
+; CHECK-NEXT:          - 0xffffffff
+; CHECK-NEXT:          - 0xffffffff
+; CHECK-NEXT:          - 0xffffffff
+; CHECK-NEXT:          - 0xffffffff
+; CHECK-NEXT:          - 0xffffffff
+; CHECK-NEXT:          - 0xffffffff
+; CHECK-NEXT:          - 0xffffffff
+; CHECK-NEXT:          - 0xffffffff
+; CHECK-NEXT:          - 0xffffffff
+; CHECK-NEXT:          - 0xffffffff
+; CHECK-NEXT:          - 0xffffffff
+; CHECK-NEXT:          - 0xffffffff
+; CHECK-NEXT:          - 0xffffffff
+; CHECK-NEXT:          - 0xffffffff
+; CHECK-NEXT:          - 0xffffffff
+; CHECK-NEXT:          - 0xffffffff
+; CHECK-NEXT:          - 0xffffffff
+; CHECK-NEXT:          - 0xffffffff
+; CHECK-NEXT:          - 0xffffffff
+; CHECK-NEXT:          - 0xffffffff
+; CHECK-NEXT:          - 0xffffffff
+; CHECK-NEXT:          - 0xffffffff
+; CHECK-NEXT:        .user_sgprs:     0x3
+; CHECK-NEXT:        .vgpr_count:     0x2
+; CHECK-NEXT:        .vgpr_limit:     0x100
+; CHECK-NEXT:        .wavefront_size: 0x40
+; CHECK-NEXT:        .wgp_mode:       false
+; CHECK-NEXT:      .ps:
+; CHECK-NEXT:        .debug_mode:     false
+; CHECK-NEXT:        .entry_point:    ps_shader
+; CHECK-NEXT:        .float_mode:     0xf0
+; CHECK-NEXT:        .ieee_mode:      false
+; CHECK-NEXT:        .mem_ordered:    true
+; CHECK-NEXT:        .scratch_en:     false
+; CHECK-NEXT:        .scratch_memory_size: 0
+; CHECK-NEXT:        .sgpr_count:     0x1
+; CHECK-NEXT:        .vgpr_count:     0x1
+; CHECK-NEXT:        .wgp_mode:       true
+; CHECK:    .registers:      {}
+; CHECK:amdpal.version:
+; CHECK-NEXT:  - 0x3
+; CHECK-NEXT:  - 0
+; CHECK-NEXT:...
+; CHECK-NEXT:        .end_amdgpu_pal_metadata
+
+define dllexport amdgpu_cs void @_amdgpu_cs_main(i32 inreg %arg1, i32 %arg2) #0 !lgc.shaderstage !1 {
+.entry:
+  %i = call i64 @llvm.amdgcn.s.getpc()
+  %i1 = and i64 %i, -4294967296 ; keep only the upper 32 bits of the PC
+  %i2 = zext i32 %arg1 to i64
+  %i3 = or i64 %i1, %i2 ; 64-bit address: PC upper half | %arg1
+  %i4 = inttoptr i64 %i3 to ptr addrspace(4)
+  %i5 = and i32 %arg2, 1023 ; bits [9:0] of %arg2
+  %i6 = lshr i32 %arg2, 10
+  %i7 = and i32 %i6, 1023 ; bits [19:10] of %arg2
+  %i8 = add nuw nsw i32 %i7, %i5
+  %i9 = load <4 x i32>, ptr addrspace(4) %i4, align 16
+  %.idx = shl nuw nsw i32 %i8, 2 ; multiply the sum by 4
+  call void @llvm.amdgcn.raw.buffer.store.i32(i32 1, <4 x i32> %i9, i32 %.idx, i32 0, i32 0)
+  ret void
+}
+
+define dllexport amdgpu_ps void @ps_shader() #1 {
+  ret void ; empty shader body
+}
+
+!amdgpu.pal.metadata.msgpack = !{!0}
+
+; Function Attrs: nounwind memory(readwrite)
+declare ptr addrspace(7) @lgc.buffer.desc.to.ptr(<4 x i32>) #1
+
+; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none)
+declare i64 @llvm.amdgcn.s.getpc() #2
+
+; Function Attrs: nocallback nofree nosync nounwind willreturn memory(write)
+declare void @llvm.amdgcn.raw.buffer.store.i32(i32, <4 x i32>, i32, i32, i32 immarg) #3
+
+attributes #0 = { nounwind memory(readwrite) "amdgpu-flat-work-group-size"="1024,1024" "amdgpu-memory-bound"="false" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-unroll-threshold"="700" "amdgpu-wave-limiter"="false" "amdgpu-work-group-info-arg-no"="4" "denormal-fp-math-f32"="preserve-sign" "target-features"=",+wavefrontsize64,+cumode" }
+
+attributes #1 = { nounwind memory(readwrite) "InitialPSInputAddr"="36983" }
+
+!0 = !{!"\82\B0amdpal.pipelines\91\8A\A4.api\A6Vulkan\B2.compute_registers\85\AB.tg_size_en\C3\AA.tgid_x_en\C2\AA.tgid_y_en\C2\AA.tgid_z_en\C2\AF.tidig_comp_cnt\01\B0.hardware_stages\81\A3.cs\8C\AF.checksum_value\CE\94D\D7\D0\AB.debug_mode\00\AB.float_mode\CC\C0\A9.image_op\C2\AC.mem_ordered\C3\AB.sgpr_limitj\B7.threadgroup_dimensions\93\01\CD\04\00\01\AD.trap_present\00\B2.user_data_reg_map\DC\00 \CE\10\00\00\00\CE\FF\FF\FF\FF\00\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\AB.user_sgprs\03\AB.vgpr_limit\CD\01\00\AF.wavefront_size@\B7.internal_pipeline_hash\92\CF\E7\10k\A6:\A6%\F7\CF\B2\1F\1A\D4{\DA\E1T\AA.registers\80\A8.shaders\81\A8.compute\82\B0.api_shader_hash\92\CF\E9Zn7}\1E\B9\E7\00\B1.hardware_mapping\91\A3.cs\B0.spill_threshold\CE\FF\FF\FF\FF\A5.type\A2Cs\B0.user_data_limit\01\AF.xgl_cache_info\82\B3.128_bit_cache_hash\92\CF\B4X\B8\11[\A4\88P\CF\A0;\B0\AF\FF\B4\BE\C0\AD.llpc_version\A461.1\AEamdpal.version\92\03\00"}
+!1 = !{i32 7}