diff --git a/llvm/docs/AMDGPUUsage.rst b/llvm/docs/AMDGPUUsage.rst --- a/llvm/docs/AMDGPUUsage.rst +++ b/llvm/docs/AMDGPUUsage.rst @@ -3172,6 +3172,10 @@ if a higher numbered register is used explicitly. + ".agpr_count" integer Required Number of accumulator + registers required by + each work-item for + GFX90A, GFX908. ".max_flat_workgroup_size" integer Required Maximum flat work-group size supported by the @@ -11292,6 +11296,7 @@ ".lds_size" integer Local Data Share size in bytes. ".perf_data_buffer_size" integer Performance data buffer size in bytes. ".vgpr_count" integer Number of VGPRs used. + ".agpr_count" integer Number of AGPRs used. ".sgpr_count" integer Number of SGPRs used. ".vgpr_limit" integer If non-zero, indicates the shader was compiled with a directive to instruct the compiler to limit the VGPR usage to diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp --- a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp @@ -982,6 +982,13 @@ MD->setEntryPoint(CC, MF.getFunction().getName()); MD->setNumUsedVgprs(CC, CurrentProgramInfo.NumVGPRsForWavesPerEU); + + // Only set AGPRs for supported devices + const GCNSubtarget &STM = MF.getSubtarget(); + if (STM.hasMAIInsts()) { + MD->setNumUsedAgprs(CC, CurrentProgramInfo.NumAccVGPR); + } + MD->setNumUsedSgprs(CC, CurrentProgramInfo.NumSGPRsForWavesPerEU); MD->setRsrc1(CC, CurrentProgramInfo.getPGMRSrc1(CC)); if (AMDGPU::isCompute(CC)) { @@ -998,7 +1005,6 @@ MD->setSpiPsInputAddr(MFI->getPSInputAddr()); } - const GCNSubtarget &STM = MF.getSubtarget(); if (STM.isWave32()) MD->setWave32(MF.getFunction().getCallingConv()); } diff --git a/llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.cpp b/llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.cpp --- a/llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.cpp @@ -877,6 +877,12 @@ Kern.getDocument()->getNode(STM.getWavefrontSize()); Kern[".sgpr_count"] = Kern.getDocument()->getNode(ProgramInfo.NumSGPR); Kern[".vgpr_count"] = Kern.getDocument()->getNode(ProgramInfo.NumVGPR); + + // Only add AGPR count to metadata for supported devices + if (STM.hasMAIInsts()) { + Kern[".agpr_count"] = Kern.getDocument()->getNode(ProgramInfo.NumAccVGPR); + } + Kern[".max_flat_workgroup_size"] = Kern.getDocument()->getNode(MFI.getMaxFlatWorkGroupSize()); Kern[".sgpr_spill_count"] = diff --git a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp --- a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp +++ b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp @@ -1123,7 +1123,9 @@ class KernelScopeInfo { int SgprIndexUnusedMin = -1; int VgprIndexUnusedMin = -1; + int AgprIndexUnusedMin = -1; MCContext *Ctx = nullptr; + MCSubtargetInfo const *MSTI = nullptr; void usesSgprAt(int i) { if (i >= SgprIndexUnusedMin) { @@ -1140,24 +1142,63 @@ VgprIndexUnusedMin = ++i; if (Ctx) { MCSymbol * const Sym = Ctx->getOrCreateSymbol(Twine(".kernel.vgpr_count")); - Sym->setVariableValue(MCConstantExpr::create(VgprIndexUnusedMin, *Ctx)); + Sym->setVariableValue(MCConstantExpr::create(getTotalNumVgprs(), *Ctx)); } } } + void usesAgprAt(int i) { + // Instruction will be flagged/error in AMDGPUAsmParser::MatchAndEmitInstruction + if (!hasMAIInsts(*MSTI)) + return; + + if (i >= AgprIndexUnusedMin) { + AgprIndexUnusedMin = ++i; + if (Ctx) { + MCSymbol *Sym = Ctx->getOrCreateSymbol(Twine(".kernel.agpr_count")); + Sym->setVariableValue(MCConstantExpr::create(AgprIndexUnusedMin, *Ctx)); + + // Also update vgpr_count, which is dependent on agpr_count (on gfx908, gfx90a) + MCSymbol *vSym = Ctx->getOrCreateSymbol(Twine(".kernel.vgpr_count")); + vSym->setVariableValue(MCConstantExpr::create(getTotalNumVgprs(), *Ctx)); + } + } + } + + // Sourced from AMDGPUResourceUsageAnalysis::SIFunctionResourceInfo::getTotalNumVGPRs() + int getTotalNumVgprs() { + // Default + if (!hasMAIInsts(*MSTI)) + return VgprIndexUnusedMin; + // 908, vgpr_count == max(used_vgprs, used_agprs) + if (!isGFX90A(*MSTI)) + return std::max(VgprIndexUnusedMin, AgprIndexUnusedMin); + // 90a, vgpr_count == sum(used_vgprs, used_agprs) + if (isGFX90A(*MSTI)) + return alignTo(VgprIndexUnusedMin, 4) + AgprIndexUnusedMin; + // Fall through for other targets + else + return VgprIndexUnusedMin; + } + public: KernelScopeInfo() = default; void initialize(MCContext &Context) { Ctx = &Context; + MSTI = Ctx->getSubtargetInfo(); + usesSgprAt(SgprIndexUnusedMin = -1); usesVgprAt(VgprIndexUnusedMin = -1); + if (hasMAIInsts(*MSTI)) { + usesAgprAt(AgprIndexUnusedMin = -1); + } } void usesRegister(RegisterKind RegKind, unsigned DwordRegIndex, unsigned RegWidth) { switch (RegKind) { case IS_SGPR: usesSgprAt(DwordRegIndex + RegWidth - 1); break; - case IS_AGPR: // fall through + case IS_AGPR: usesAgprAt(DwordRegIndex + RegWidth - 1); break; case IS_VGPR: usesVgprAt(DwordRegIndex + RegWidth - 1); break; default: break; } diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h --- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h +++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h @@ -750,6 +750,7 @@ bool hasGFX10_3Insts(const MCSubtargetInfo &STI); bool isGFX90A(const MCSubtargetInfo &STI); bool hasArchitectedFlatScratch(const MCSubtargetInfo &STI); +bool hasMAIInsts(const MCSubtargetInfo &STI); /// Is Reg - scalar register bool isSGPR(unsigned Reg, const MCRegisterInfo* TRI); diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp --- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp +++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp @@ -1480,6 +1480,10 @@ return STI.getFeatureBits()[AMDGPU::FeatureArchitectedFlatScratch]; } +bool hasMAIInsts(const MCSubtargetInfo &STI) { + return STI.getFeatureBits()[AMDGPU::FeatureMAIInsts]; +} + bool isSGPR(unsigned Reg, const MCRegisterInfo* TRI) { const MCRegisterClass SGPRClass = TRI->getRegClass(AMDGPU::SReg_32RegClassID); const unsigned FirstSubReg = TRI->getSubReg(Reg, AMDGPU::sub0); diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUPALMetadata.h b/llvm/lib/Target/AMDGPU/Utils/AMDGPUPALMetadata.h --- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUPALMetadata.h +++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUPALMetadata.h @@ -69,6 +69,10 @@ // the shader stage to determine the number of vgprs to allocate. void setNumUsedVgprs(unsigned CC, unsigned Val); + // Set the number of used agprs in the metadata. This is an optional advisory + // record for logging etc; + void setNumUsedAgprs(unsigned CC, unsigned Val); + // Set the number of used sgprs in the metadata. This is an optional advisory // record for logging etc; wave dispatch actually uses the rsrc1 register for // the shader stage to determine the number of sgprs to allocate. diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUPALMetadata.cpp b/llvm/lib/Target/AMDGPU/Utils/AMDGPUPALMetadata.cpp --- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUPALMetadata.cpp +++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUPALMetadata.cpp @@ -209,6 +209,11 @@ getHwStage(CC)[".vgpr_count"] = MsgPackDoc.getNode(Val); } +// Set the number of used agprs in the metadata. +void AMDGPUPALMetadata::setNumUsedAgprs(CallingConv::ID CC, unsigned Val) { + getHwStage(CC)[".agpr_count"] = Val; +} + // Set the number of used sgprs in the metadata. This is an optional advisory // record for logging etc; wave dispatch actually uses the rsrc1 register for // the shader stage to determine the number of sgprs to allocate. diff --git a/llvm/test/CodeGen/AMDGPU/amdpal-metadata-agpr-register-count.ll b/llvm/test/CodeGen/AMDGPU/amdpal-metadata-agpr-register-count.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/amdpal-metadata-agpr-register-count.ll @@ -0,0 +1,78 @@ +; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx90a < %s | FileCheck -check-prefixes=CHECK,GFX90A %s +; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx908 < %s | FileCheck -check-prefixes=CHECK,GFX908 %s + +; COM: Adapted from agpr-register-count.ll +; COM: GFX900 and below should not have .agpr_count present in the metadata + + +; CHECK: .type kernel_32_agprs +; CHECK: NumAgprs: 32 +define amdgpu_kernel void @kernel_32_agprs() #0 { +bb: + call void asm sideeffect "", "~{v8}" () + call void asm sideeffect "", "~{a31}" () + ret void +} + +; CHECK: .type kernel_0_agprs +; CHECK: NumAgprs: 0 +define amdgpu_kernel void @kernel_0_agprs() #0 { +bb: + call void asm sideeffect "", "~{v0}" () + ret void +} + +; CHECK: .type kernel_40_vgprs +; CHECK: NumAgprs: 16 +define amdgpu_kernel void @kernel_40_vgprs() #0 { +bb: + call void asm sideeffect "", "~{v39}" () + call void asm sideeffect "", "~{a15}" () + ret void +} + +; CHECK: .type kernel_max_gprs +; CHECK: NumAgprs: 256 +define amdgpu_kernel void @kernel_max_gprs() #0 { +bb: + call void asm sideeffect "", "~{v255}" () + call void asm sideeffect "", "~{a255}" () + ret void +} + +; CHECK: .type func_32_agprs +; CHECK: NumAgprs: 32 +define void @func_32_agprs() #0 { +bb: + call void asm sideeffect "", "~{v8}" () + call void asm sideeffect "", "~{a31}" () + ret void +} + +; CHECK: .type kernel_call_func_32_agprs +; CHECK: NumAgprs: 32 +define amdgpu_kernel void @kernel_call_func_32_agprs() #0 { +bb: + call void @func_32_agprs() #0 + ret void +} + +declare void @undef_func() + +; CHECK: .type kernel_call_undef_func +; CHECK: NumAgprs: 32 +define amdgpu_kernel void @kernel_call_undef_func() #0 { +bb: + call void @undef_func() + ret void +} + +; CHECK: --- +; CHECK: amdpal.pipelines: +; GFX90A: agpr_count: 0x20 +; GFX90A: vgpr_count: 0x40 + +; GFX908: agpr_count: 0x20 +; GFX908: vgpr_count: 0x20 + +attributes #0 = { nounwind noinline "amdgpu-flat-work-group-size"="1,512" } diff --git a/llvm/test/CodeGen/AMDGPU/hsa-metadata-agpr-register-count.ll b/llvm/test/CodeGen/AMDGPU/hsa-metadata-agpr-register-count.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/hsa-metadata-agpr-register-count.ll @@ -0,0 +1,101 @@ +; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx90a -verify-machineinstrs < %s | FileCheck -check-prefixes=CHECK,GFX90A %s +; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx908 -verify-machineinstrs < %s | FileCheck -check-prefixes=CHECK,GFX908 %s +; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx801 -verify-machineinstrs < %s | FileCheck -check-prefixes=CHECK,GFX801 %s + +; COM: Adapted from agpr-register-count.ll +; COM: GFX900 and below should not have .agpr_count present in the metadata + +; CHECK: --- +; CHECK: amdhsa.kernels: + +; GFX90A: - .agpr_count: 32 +; GFX908: - .agpr_count: 32 +; GFX801-NOT: - .agpr_count: +; CHECK: .name: kernel_32_agprs +; GFX90A: .vgpr_count: 44 +; GFX908: .vgpr_count: 32 +; GFX801: .vgpr_count: 9 +define amdgpu_kernel void @kernel_32_agprs() #0 { +bb: + call void asm sideeffect "", "~{v8}" () + call void asm sideeffect "", "~{a31}" () + ret void +} + +; GFX90A: - .agpr_count: 0 +; GFX908: - .agpr_count: 0 +; GFX801-NOT: - .agpr_count: +; CHECK: .name: kernel_0_agprs +; GFX90A: .vgpr_count: 1 +; GFX908: .vgpr_count: 1 +; GFX801: .vgpr_count: 1 +define amdgpu_kernel void @kernel_0_agprs() #0 { +bb: + call void asm sideeffect "", "~{v0}" () + ret void +} + +; GFX90A: - .agpr_count: 16 +; GFX908: - .agpr_count: 16 +; GFX801-NOT: - .agpr_count: +; CHECK: .name: kernel_40_vgprs +; GFX90A: .vgpr_count: 56 +; GFX908: .vgpr_count: 40 +; GFX801: .vgpr_count: 40 +define amdgpu_kernel void @kernel_40_vgprs() #0 { +bb: + call void asm sideeffect "", "~{v39}" () + call void asm sideeffect "", "~{a15}" () + ret void +} + +; GFX90A: - .agpr_count: 256 +; GFX908: - .agpr_count: 256 +; GFX801-NOT: - .agpr_count: +; CHECK: .name: kernel_max_gprs +; GFX90A: .vgpr_count: 512 +; GFX908: .vgpr_count: 256 +; GFX801: .vgpr_count: 256 +define amdgpu_kernel void @kernel_max_gprs() #0 { +bb: + call void asm sideeffect "", "~{v255}" () + call void asm sideeffect "", "~{a255}" () + ret void +} + +define void @func_32_agprs() #0 { +bb: + call void asm sideeffect "", "~{v8}" () + call void asm sideeffect "", "~{a31}" () + ret void +} + +; GFX90A: - .agpr_count: 32 +; GFX908: - .agpr_count: 32 +; GFX801-NOT: - .agpr_count: +; CHECK: .name: kernel_call_func_32_agprs +; GFX90A: .vgpr_count: 44 +; GFX908: .vgpr_count: 32 +; GFX801: .vgpr_count: 9 +define amdgpu_kernel void @kernel_call_func_32_agprs() #0 { +bb: + call void @func_32_agprs() #0 + ret void +} + +declare void @undef_func() + +; GFX90A: - .agpr_count: 32 +; GFX908: - .agpr_count: 32 +; GFX801-NOT: - .agpr_count: +; CHECK: .name: kernel_call_undef_func +; GFX90A: .vgpr_count: 64 +; GFX908: .vgpr_count: 32 +; GFX801: .vgpr_count: 32 +define amdgpu_kernel void @kernel_call_undef_func() #0 { +bb: + call void @undef_func() + ret void +} + +attributes #0 = { nounwind noinline "amdgpu-flat-work-group-size"="1,512" } diff --git a/llvm/test/CodeGen/AMDGPU/hsa-metadata-agpr-small.ll b/llvm/test/CodeGen/AMDGPU/hsa-metadata-agpr-small.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/hsa-metadata-agpr-small.ll @@ -0,0 +1,57 @@ +; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx908 < %s | FileCheck -check-prefixes=CHECK,GFX908 %s +; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx90a < %s | FileCheck -check-prefixes=CHECK,GFX90A %s +; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx801 < %s | FileCheck -check-prefixes=CHECK,GFX801 %s + +; COM: Comments for each kernel +; CHECK: kernel_32_agprs +; GFX908: ; NumVgprs: 9 +; GFX908 ; NumAgprs: 32 +; GFX908 ; TotalNumVgprs: 32 + +; GFX90A: ; NumVgprs: 9 +; GFX90A ; NumAgprs: 32 +; GFX90A ; TotalNumVgprs: 44 + +; GFX801: ; NumVgprs: 9 + +; CHECK: kernel_40_vgprs +; GFX908: ; NumVgprs: 40 +; GFX908 ; NumAgprs: 16 +; GFX908 ; TotalNumVgprs: 40 + +; GFX90A: ; NumVgprs: 40 +; GFX90A ; NumAgprs: 16 +; GFX90A ; TotalNumVgprs: 56 + +; GFX801: ; NumVgprs: 40 + +; COM: Metadata +; GFX908: - .agpr_count: 32 +; GFX908: .vgpr_count: 32 + +; GFX90A: - .agpr_count: 32 +; GFX90A: .vgpr_count: 44 + +; GFX801: .vgpr_count: 9 +define amdgpu_kernel void @kernel_32_agprs() #0 { +bb: + call void asm sideeffect "", "~{v8}" () + call void asm sideeffect "", "~{a31}" () + ret void +} + +; GFX908: - .agpr_count: 16 +; GFX908: .vgpr_count: 40 + +; GFX90A: - .agpr_count: 16 +; GFX90A: .vgpr_count: 56 + +; GFX801: .vgpr_count: 40 +define amdgpu_kernel void @kernel_40_vgprs() #0 { +bb: + call void asm sideeffect "", "~{v39}" () + call void asm sideeffect "", "~{a15}" () + ret void +} + +attributes #0 = { nounwind noinline "amdgpu-flat-work-group-size"="1,512" } diff --git a/llvm/test/MC/AMDGPU/sym_kernel_scope_agpr.s b/llvm/test/MC/AMDGPU/sym_kernel_scope_agpr.s new file mode 100644 --- /dev/null +++ b/llvm/test/MC/AMDGPU/sym_kernel_scope_agpr.s @@ -0,0 +1,62 @@ +// RUN: llvm-mc -arch=amdgcn -mcpu=gfx90a %s 2>&1 | FileCheck -check-prefixes=GFX90A %s +// RUN: llvm-mc -arch=amdgcn -mcpu=gfx908 %s 2>&1 | FileCheck -check-prefixes=GFX908 %s +// Based on sym_kernel_scope.s + +.byte .kernel.agpr_count +// CHECK: .byte 0 +.byte .kernel.vgpr_count +// CHECK: .byte 0 + + v_accvgpr_write_b32 a0, v6 + v_accvgpr_read_b32 v3, a3 + s_endpgm +.byte .kernel.agpr_count +// GFX90A: .byte 4 +// GFX908: .byte 4 +.byte .kernel.vgpr_count +// GFX90A: .byte 12 +// GFX908: .byte 7 + +.amdgpu_hsa_kernel K1 +K1: +.byte .kernel.agpr_count +// CHECK: .byte 0 +.byte .kernel.vgpr_count +// CHECK: .byte 0 + v_accvgpr_write_b32 a44, v6 + s_endpgm +.byte .kernel.agpr_count +// GFX90A: .byte 45 +// GFX908: .byte 45 +.byte .kernel.vgpr_count +// GFX90A: .byte 53 +// GFX908: .byte 45 + +.amdgpu_hsa_kernel K2 +.byte .kernel.agpr_count +// CHECK: .byte 0 +.byte .kernel.vgpr_count +// CHECK: .byte 0 +K2: + v_mfma_f32_4x4x1f32 a[0:3], v1, v0, a[0:3] cbsz:1 abid:2 blgp:3 + s_endpgm +.byte .kernel.agpr_count +// GFX90A: .byte 4 +// GFX908: .byte 4 +.byte .kernel.vgpr_count +// GFX90A: .byte 8 +// GFX908: .byte 4 + +.text +.amdgpu_hsa_kernel K3 +K3: + v_accvgpr_read_b32 v[0], a0 + v_mfma_f32_16x16x1f32 a[0:15], v1, v0, a[0:15] cbsz:1 abid:2 blgp:3 + s_endpgm + +.byte .kernel.agpr_count +// GFX90A: .byte 16 +// GFX908: .byte 16 +.byte .kernel.vgpr_count +// GFX90A: .byte 20 +// GFX908: .byte 16