diff --git a/llvm/docs/AMDGPUUsage.rst b/llvm/docs/AMDGPUUsage.rst
--- a/llvm/docs/AMDGPUUsage.rst
+++ b/llvm/docs/AMDGPUUsage.rst
@@ -3172,6 +3172,10 @@
                                                                   if a higher numbered
                                                                   register is used
                                                                   explicitly.
+     ".agpr_count"                       integer        Required  Number of accumulator
+                                                                  registers required by
+                                                                  each work-item for
+                                                                  GFX90A, GFX908.
      ".max_flat_workgroup_size"          integer        Required  Maximum flat
                                                                   work-group size
                                                                   supported by the
@@ -11292,6 +11296,7 @@
      ".lds_size"                integer                  Local Data Share size in bytes.
      ".perf_data_buffer_size"   integer                  Performance data buffer size in bytes.
      ".vgpr_count"              integer                  Number of VGPRs used.
+     ".agpr_count"              integer                  Number of AGPRs used.
      ".sgpr_count"              integer                  Number of SGPRs used.
      ".vgpr_limit"              integer                  If non-zero, indicates the shader was compiled with a
                                                          directive to instruct the compiler to limit the VGPR usage to
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
--- a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
@@ -982,6 +982,13 @@
 
   MD->setEntryPoint(CC, MF.getFunction().getName());
   MD->setNumUsedVgprs(CC, CurrentProgramInfo.NumVGPRsForWavesPerEU);
+
+  // Only set AGPRs for supported devices
+  const GCNSubtarget &STM = MF.getSubtarget<GCNSubtarget>();
+  if (STM.hasMAIInsts()) {
+    MD->setNumUsedAgprs(CC, CurrentProgramInfo.NumAccVGPR);
+  }
+
   MD->setNumUsedSgprs(CC, CurrentProgramInfo.NumSGPRsForWavesPerEU);
   MD->setRsrc1(CC, CurrentProgramInfo.getPGMRSrc1(CC));
   if (AMDGPU::isCompute(CC)) {
@@ -998,7 +1005,6 @@
     MD->setSpiPsInputAddr(MFI->getPSInputAddr());
   }
 
-  const GCNSubtarget &STM = MF.getSubtarget<GCNSubtarget>();
   if (STM.isWave32())
     MD->setWave32(MF.getFunction().getCallingConv());
 }
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.cpp b/llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.cpp
--- a/llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.cpp
@@ -877,6 +877,12 @@
       Kern.getDocument()->getNode(STM.getWavefrontSize());
   Kern[".sgpr_count"] = Kern.getDocument()->getNode(ProgramInfo.NumSGPR);
   Kern[".vgpr_count"] = Kern.getDocument()->getNode(ProgramInfo.NumVGPR);
+
+  // Only add AGPR count to metadata for supported devices
+  if (STM.hasMAIInsts()) {
+    Kern[".agpr_count"] = Kern.getDocument()->getNode(ProgramInfo.NumAccVGPR);
+  }
+
   Kern[".max_flat_workgroup_size"] =
       Kern.getDocument()->getNode(MFI.getMaxFlatWorkGroupSize());
   Kern[".sgpr_spill_count"] =
diff --git a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
--- a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
+++ b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
@@ -1123,7 +1123,9 @@
 class KernelScopeInfo {
   int SgprIndexUnusedMin = -1;
   int VgprIndexUnusedMin = -1;
+  int AgprIndexUnusedMin = -1;
   MCContext *Ctx = nullptr;
+  MCSubtargetInfo const *MSTI = nullptr;
 
   void usesSgprAt(int i) {
     if (i >= SgprIndexUnusedMin) {
@@ -1140,24 +1142,63 @@
       VgprIndexUnusedMin = ++i;
       if (Ctx) {
         MCSymbol * const Sym = Ctx->getOrCreateSymbol(Twine(".kernel.vgpr_count"));
-        Sym->setVariableValue(MCConstantExpr::create(VgprIndexUnusedMin, *Ctx));
+        Sym->setVariableValue(MCConstantExpr::create(getTotalNumVgprs(), *Ctx));
       }
     }
   }
 
+  void usesAgprAt(int i) {
+    // Instruction will be flagged/error in AMDGPUAsmParser::MatchAndEmitInstruction
+    if (!hasMAIInsts(*MSTI))
+      return;
+
+    if (i >= AgprIndexUnusedMin) {
+      AgprIndexUnusedMin = ++i;
+      if (Ctx) {
+        MCSymbol *Sym = Ctx->getOrCreateSymbol(Twine(".kernel.agpr_count"));
+        Sym->setVariableValue(MCConstantExpr::create(AgprIndexUnusedMin, *Ctx));
+
+        // Also update vgpr_count, which is dependent on agpr_count (on gfx908, gfx90a)
+        MCSymbol *vSym = Ctx->getOrCreateSymbol(Twine(".kernel.vgpr_count"));
+        vSym->setVariableValue(MCConstantExpr::create(getTotalNumVgprs(), *Ctx));
+      }
+    }
+  }
+
+  // Sourced from AMDGPUResourceUsageAnalysis::SIFunctionResourceInfo::getTotalNumVGPRs()
+  int getTotalNumVgprs() {
+    // Default
+    if (!hasMAIInsts(*MSTI))
+      return VgprIndexUnusedMin;
+    // 908, vgpr_count == max(used_vgprs, used_agprs)
+    if (!isGFX90A(*MSTI))
+      return std::max(VgprIndexUnusedMin, AgprIndexUnusedMin);
+    // 90a, vgpr_count == sum(used_vgprs, used_agprs)
+    if (isGFX90A(*MSTI))
+      return alignTo(VgprIndexUnusedMin, 4) + AgprIndexUnusedMin;
+    // Fall through for other targets
+    else
+      return VgprIndexUnusedMin;
+  }
+
 public:
   KernelScopeInfo() = default;
 
   void initialize(MCContext &Context) {
     Ctx = &Context;
+    MSTI = Ctx->getSubtargetInfo();
+
     usesSgprAt(SgprIndexUnusedMin = -1);
     usesVgprAt(VgprIndexUnusedMin = -1);
+    if (hasMAIInsts(*MSTI)) {
+      usesAgprAt(AgprIndexUnusedMin = -1);
+    }
   }
 
   void usesRegister(RegisterKind RegKind, unsigned DwordRegIndex, unsigned RegWidth) {
     switch (RegKind) {
       case IS_SGPR: usesSgprAt(DwordRegIndex + RegWidth - 1); break;
-      case IS_AGPR: // fall through
+      case IS_AGPR: usesAgprAt(DwordRegIndex + RegWidth - 1); break;
       case IS_VGPR: usesVgprAt(DwordRegIndex + RegWidth - 1); break;
       default: break;
     }
diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
--- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
+++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
@@ -750,6 +750,7 @@
 bool hasGFX10_3Insts(const MCSubtargetInfo &STI);
 bool isGFX90A(const MCSubtargetInfo &STI);
 bool hasArchitectedFlatScratch(const MCSubtargetInfo &STI);
+bool hasMAIInsts(const MCSubtargetInfo &STI);
 
 /// Is Reg - scalar register
 bool isSGPR(unsigned Reg, const MCRegisterInfo* TRI);
diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
--- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
@@ -1480,6 +1480,10 @@
   return STI.getFeatureBits()[AMDGPU::FeatureArchitectedFlatScratch];
 }
 
+bool hasMAIInsts(const MCSubtargetInfo &STI) {
+  return STI.getFeatureBits()[AMDGPU::FeatureMAIInsts];
+}
+
 bool isSGPR(unsigned Reg, const MCRegisterInfo* TRI) {
   const MCRegisterClass SGPRClass = TRI->getRegClass(AMDGPU::SReg_32RegClassID);
   const unsigned FirstSubReg = TRI->getSubReg(Reg, AMDGPU::sub0);
diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUPALMetadata.h b/llvm/lib/Target/AMDGPU/Utils/AMDGPUPALMetadata.h
--- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUPALMetadata.h
+++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUPALMetadata.h
@@ -69,6 +69,10 @@
   // the shader stage to determine the number of vgprs to allocate.
   void setNumUsedVgprs(unsigned CC, unsigned Val);
 
+  // Set the number of used agprs in the metadata. This is an optional advisory
+  // record for logging etc;
+  void setNumUsedAgprs(unsigned CC, unsigned Val);
+
   // Set the number of used sgprs in the metadata. This is an optional advisory
   // record for logging etc; wave dispatch actually uses the rsrc1 register for
   // the shader stage to determine the number of sgprs to allocate.
diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUPALMetadata.cpp b/llvm/lib/Target/AMDGPU/Utils/AMDGPUPALMetadata.cpp
--- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUPALMetadata.cpp
+++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUPALMetadata.cpp
@@ -209,6 +209,11 @@
   getHwStage(CC)[".vgpr_count"] = MsgPackDoc.getNode(Val);
 }
 
+// Set the number of used agprs in the metadata.
+void AMDGPUPALMetadata::setNumUsedAgprs(CallingConv::ID CC, unsigned Val) {
+  getHwStage(CC)[".agpr_count"] = Val;
+}
+
 // Set the number of used sgprs in the metadata. This is an optional advisory
 // record for logging etc; wave dispatch actually uses the rsrc1 register for
 // the shader stage to determine the number of sgprs to allocate.
diff --git a/llvm/test/CodeGen/AMDGPU/amdpal-metadata-agpr-register-count.ll b/llvm/test/CodeGen/AMDGPU/amdpal-metadata-agpr-register-count.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/amdpal-metadata-agpr-register-count.ll
@@ -0,0 +1,78 @@
+; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx90a < %s | FileCheck -check-prefixes=CHECK,GFX90A %s
+; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx908 < %s | FileCheck -check-prefixes=CHECK,GFX908 %s
+
+; COM: Adapted from agpr-register-count.ll
+; COM: GFX900 and below should not have .agpr_count present in the metadata
+
+
+; CHECK:      .type          kernel_32_agprs
+; CHECK:      NumAgprs:       32
+define amdgpu_kernel void @kernel_32_agprs() #0 {
+bb:
+  call void asm sideeffect "", "~{v8}" ()
+  call void asm sideeffect "", "~{a31}" ()
+  ret void
+}
+
+; CHECK:      .type          kernel_0_agprs
+; CHECK:      NumAgprs:       0
+define amdgpu_kernel void @kernel_0_agprs() #0 {
+bb:
+  call void asm sideeffect "", "~{v0}" ()
+  ret void
+}
+
+; CHECK:      .type           kernel_40_vgprs
+; CHECK:      NumAgprs:       16
+define amdgpu_kernel void @kernel_40_vgprs() #0 {
+bb:
+  call void asm sideeffect "", "~{v39}" ()
+  call void asm sideeffect "", "~{a15}" ()
+  ret void
+}
+
+; CHECK:      .type          kernel_max_gprs
+; CHECK:      NumAgprs:       256
+define amdgpu_kernel void @kernel_max_gprs() #0 {
+bb:
+  call void asm sideeffect "", "~{v255}" ()
+  call void asm sideeffect "", "~{a255}" ()
+  ret void
+}
+
+; CHECK:      .type          func_32_agprs
+; CHECK:      NumAgprs:       32
+define void @func_32_agprs() #0 {
+bb:
+  call void asm sideeffect "", "~{v8}" ()
+  call void asm sideeffect "", "~{a31}" ()
+  ret void
+}
+
+; CHECK:      .type          kernel_call_func_32_agprs
+; CHECK:      NumAgprs:       32
+define amdgpu_kernel void @kernel_call_func_32_agprs() #0 {
+bb:
+  call void @func_32_agprs() #0
+  ret void
+}
+
+declare void @undef_func()
+
+; CHECK:      .type          kernel_call_undef_func
+; CHECK:      NumAgprs:       32
+define amdgpu_kernel void @kernel_call_undef_func() #0 {
+bb:
+  call void @undef_func()
+  ret void
+}
+
+; CHECK: ---
+; CHECK:  amdpal.pipelines:
+; GFX90A: agpr_count:  0x20
+; GFX90A: vgpr_count:  0x40
+
+; GFX908: agpr_count:  0x20
+; GFX908: vgpr_count:  0x20
+
+attributes #0 = { nounwind noinline "amdgpu-flat-work-group-size"="1,512" }
diff --git a/llvm/test/CodeGen/AMDGPU/hsa-metadata-agpr-register-count.ll b/llvm/test/CodeGen/AMDGPU/hsa-metadata-agpr-register-count.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/hsa-metadata-agpr-register-count.ll
@@ -0,0 +1,101 @@
+; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx90a -verify-machineinstrs < %s | FileCheck -check-prefixes=CHECK,GFX90A %s
+; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx908 -verify-machineinstrs < %s | FileCheck -check-prefixes=CHECK,GFX908 %s
+; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx801 -verify-machineinstrs < %s | FileCheck -check-prefixes=CHECK,GFX801 %s
+
+; COM: Adapted from agpr-register-count.ll
+; COM: GFX900 and below should not have .agpr_count present in the metadata
+
+; CHECK: ---
+; CHECK:  amdhsa.kernels:
+
+; GFX90A:    - .agpr_count:    32
+; GFX908:    - .agpr_count:    32
+; GFX801-NOT:    - .agpr_count:
+; CHECK:      .name:          kernel_32_agprs
+; GFX90A:      .vgpr_count:    44
+; GFX908:      .vgpr_count:    32
+; GFX801:      .vgpr_count:    9
+define amdgpu_kernel void @kernel_32_agprs() #0 {
+bb:
+  call void asm sideeffect "", "~{v8}" ()
+  call void asm sideeffect "", "~{a31}" ()
+  ret void
+}
+
+; GFX90A:    - .agpr_count:    0
+; GFX908:    - .agpr_count:    0
+; GFX801-NOT:    - .agpr_count:
+; CHECK:      .name:          kernel_0_agprs
+; GFX90A:      .vgpr_count:    1
+; GFX908:      .vgpr_count:    1
+; GFX801:      .vgpr_count:    1
+define amdgpu_kernel void @kernel_0_agprs() #0 {
+bb:
+  call void asm sideeffect "", "~{v0}" ()
+  ret void
+}
+
+; GFX90A:    - .agpr_count:    16
+; GFX908:    - .agpr_count:    16
+; GFX801-NOT:    - .agpr_count:
+; CHECK:      .name:          kernel_40_vgprs
+; GFX90A:      .vgpr_count:    56
+; GFX908:      .vgpr_count:    40
+; GFX801:      .vgpr_count:    40
+define amdgpu_kernel void @kernel_40_vgprs() #0 {
+bb:
+  call void asm sideeffect "", "~{v39}" ()
+  call void asm sideeffect "", "~{a15}" ()
+  ret void
+}
+
+; GFX90A:    - .agpr_count:    256
+; GFX908:    - .agpr_count:    256
+; GFX801-NOT:    - .agpr_count:
+; CHECK:      .name:          kernel_max_gprs
+; GFX90A:      .vgpr_count:    512
+; GFX908:      .vgpr_count:    256
+; GFX801:      .vgpr_count:    256
+define amdgpu_kernel void @kernel_max_gprs() #0 {
+bb:
+  call void asm sideeffect "", "~{v255}" ()
+  call void asm sideeffect "", "~{a255}" ()
+  ret void
+}
+
+define void @func_32_agprs() #0 {
+bb:
+  call void asm sideeffect "", "~{v8}" ()
+  call void asm sideeffect "", "~{a31}" ()
+  ret void
+}
+
+; GFX90A:    - .agpr_count:    32
+; GFX908:    - .agpr_count:    32
+; GFX801-NOT:    - .agpr_count:
+; CHECK:      .name:          kernel_call_func_32_agprs
+; GFX90A:      .vgpr_count:    44
+; GFX908:      .vgpr_count:    32
+; GFX801:      .vgpr_count:    9
+define amdgpu_kernel void @kernel_call_func_32_agprs() #0 {
+bb:
+  call void @func_32_agprs() #0
+  ret void
+}
+
+declare void @undef_func()
+
+; GFX90A:    - .agpr_count:    32
+; GFX908:    - .agpr_count:    32
+; GFX801-NOT:    - .agpr_count:
+; CHECK:      .name:          kernel_call_undef_func
+; GFX90A:      .vgpr_count:    64
+; GFX908:      .vgpr_count:    32
+; GFX801:      .vgpr_count:    32
+define amdgpu_kernel void @kernel_call_undef_func() #0 {
+bb:
+  call void @undef_func()
+  ret void
+}
+
+attributes #0 = { nounwind noinline "amdgpu-flat-work-group-size"="1,512" }
diff --git a/llvm/test/CodeGen/AMDGPU/hsa-metadata-agpr-small.ll b/llvm/test/CodeGen/AMDGPU/hsa-metadata-agpr-small.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/hsa-metadata-agpr-small.ll
@@ -0,0 +1,57 @@
+; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx908 < %s | FileCheck -check-prefixes=CHECK,GFX908 %s
+; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx90a < %s | FileCheck -check-prefixes=CHECK,GFX90A %s
+; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx801 < %s | FileCheck -check-prefixes=CHECK,GFX801 %s
+
+; COM: Comments for each kernel
+; CHECK: kernel_32_agprs
+; GFX908:   ; NumVgprs: 9
+; GFX908    ; NumAgprs: 32
+; GFX908    ; TotalNumVgprs: 32
+
+; GFX90A:   ; NumVgprs: 9
+; GFX90A    ; NumAgprs: 32
+; GFX90A    ; TotalNumVgprs: 44
+
+; GFX801:   ; NumVgprs: 9
+
+; CHECK: kernel_40_vgprs
+; GFX908:   ; NumVgprs: 40
+; GFX908    ; NumAgprs: 16
+; GFX908    ; TotalNumVgprs: 40
+
+; GFX90A:   ; NumVgprs: 40
+; GFX90A    ; NumAgprs: 16
+; GFX90A    ; TotalNumVgprs: 56
+
+; GFX801:   ; NumVgprs: 40
+
+; COM: Metadata
+; GFX908:    - .agpr_count:    32
+; GFX908:      .vgpr_count:    32
+
+; GFX90A:    - .agpr_count:    32
+; GFX90A:      .vgpr_count:    44
+
+; GFX801:      .vgpr_count:    9
+define amdgpu_kernel void @kernel_32_agprs() #0 {
+bb:
+  call void asm sideeffect "", "~{v8}" ()
+  call void asm sideeffect "", "~{a31}" ()
+  ret void
+}
+
+; GFX908:    - .agpr_count:    16
+; GFX908:      .vgpr_count:    40
+
+; GFX90A:    - .agpr_count:    16
+; GFX90A:      .vgpr_count:    56
+
+; GFX801:      .vgpr_count:    40
+define amdgpu_kernel void @kernel_40_vgprs() #0 {
+bb:
+  call void asm sideeffect "", "~{v39}" ()
+  call void asm sideeffect "", "~{a15}" ()
+  ret void
+}
+
+attributes #0 = { nounwind noinline "amdgpu-flat-work-group-size"="1,512" }
diff --git a/llvm/test/MC/AMDGPU/sym_kernel_scope_agpr.s b/llvm/test/MC/AMDGPU/sym_kernel_scope_agpr.s
new file mode 100644
--- /dev/null
+++ b/llvm/test/MC/AMDGPU/sym_kernel_scope_agpr.s
@@ -0,0 +1,62 @@
+// RUN: llvm-mc -arch=amdgcn -mcpu=gfx90a %s 2>&1 | FileCheck -check-prefixes=GFX90A %s
+// RUN: llvm-mc -arch=amdgcn -mcpu=gfx908 %s 2>&1 | FileCheck -check-prefixes=GFX908 %s
+// Based on sym_kernel_scope.s
+
+.byte .kernel.agpr_count
+// CHECK: .byte 0
+.byte .kernel.vgpr_count
+// CHECK: .byte 0
+
+    v_accvgpr_write_b32 a0, v6
+    v_accvgpr_read_b32 v3, a3
+    s_endpgm
+.byte .kernel.agpr_count
+// GFX90A: .byte 4
+// GFX908: .byte 4
+.byte .kernel.vgpr_count
+// GFX90A: .byte 12
+// GFX908: .byte 7
+
+.amdgpu_hsa_kernel K1
+K1:
+.byte .kernel.agpr_count
+// CHECK: .byte 0
+.byte .kernel.vgpr_count
+// CHECK: .byte 0
+    v_accvgpr_write_b32 a44, v6
+    s_endpgm
+.byte .kernel.agpr_count
+// GFX90A: .byte 45
+// GFX908: .byte 45
+.byte .kernel.vgpr_count
+// GFX90A: .byte 53
+// GFX908: .byte 45
+
+.amdgpu_hsa_kernel K2
+.byte .kernel.agpr_count
+// CHECK: .byte 0
+.byte .kernel.vgpr_count
+// CHECK: .byte 0
+K2:
+    v_mfma_f32_4x4x1f32 a[0:3], v1, v0, a[0:3] cbsz:1 abid:2 blgp:3
+    s_endpgm
+.byte .kernel.agpr_count
+// GFX90A: .byte 4
+// GFX908: .byte 4
+.byte .kernel.vgpr_count
+// GFX90A: .byte 8
+// GFX908: .byte 4
+
+.text
+.amdgpu_hsa_kernel K3
+K3:
+    v_accvgpr_read_b32 v[0], a0
+    v_mfma_f32_16x16x1f32 a[0:15], v1, v0, a[0:15] cbsz:1 abid:2 blgp:3
+    s_endpgm
+
+.byte .kernel.agpr_count
+// GFX90A: .byte 16
+// GFX908: .byte 16
+.byte .kernel.vgpr_count
+// GFX90A: .byte 20
+// GFX908: .byte 16