Index: llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.h =================================================================== --- llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.h +++ llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.h @@ -43,6 +43,7 @@ // Track the number of explicitly used VGPRs. Special registers reserved at // the end are tracked separately. int32_t NumVGPR = 0; + int32_t NumAGPR = 0; int32_t NumExplicitSGPR = 0; uint64_t PrivateSegmentSize = 0; bool UsesVCC = false; @@ -51,6 +52,7 @@ bool HasRecursion = false; int32_t getTotalNumSGPRs(const GCNSubtarget &ST) const; + int32_t getTotalNumVGPRs(const GCNSubtarget &ST) const; }; SIProgramInfo CurrentProgramInfo; @@ -77,6 +79,8 @@ void EmitPALMetadata(const MachineFunction &MF, const SIProgramInfo &KernelInfo); void emitCommonFunctionComments(uint32_t NumVGPR, + Optional<uint32_t> NumAGPR, + uint32_t TotalNumVGPR, uint32_t NumSGPR, uint64_t ScratchSize, uint64_t CodeSize, Index: llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp =================================================================== --- llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp +++ llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp @@ -342,6 +342,8 @@ // Print comments that apply to both callable functions and entry points. 
void AMDGPUAsmPrinter::emitCommonFunctionComments( uint32_t NumVGPR, + Optional<uint32_t> NumAGPR, + uint32_t TotalNumVGPR, uint32_t NumSGPR, uint64_t ScratchSize, uint64_t CodeSize, @@ -349,6 +351,11 @@ OutStreamer->emitRawComment(" codeLenInByte = " + Twine(CodeSize), false); OutStreamer->emitRawComment(" NumSgprs: " + Twine(NumSGPR), false); OutStreamer->emitRawComment(" NumVgprs: " + Twine(NumVGPR), false); + if (NumAGPR) { + OutStreamer->emitRawComment(" NumAgprs: " + Twine(*NumAGPR), false); + OutStreamer->emitRawComment(" TotalNumVgprs: " + Twine(TotalNumVGPR), + false); + } OutStreamer->emitRawComment(" ScratchSize: " + Twine(ScratchSize), false); OutStreamer->emitRawComment(" MemoryBound: " + Twine(MFI->isMemoryBound()), false); @@ -474,6 +481,8 @@ SIFunctionResourceInfo &Info = CallGraphResourceInfo[&MF.getFunction()]; emitCommonFunctionComments( Info.NumVGPR, + STM.hasMAIInsts() ? Info.NumAGPR : Optional<uint32_t>(), + Info.getTotalNumVGPRs(STM), Info.getTotalNumSGPRs(MF.getSubtarget<GCNSubtarget>()), Info.PrivateSegmentSize, getFunctionCodeSize(MF), MFI); @@ -481,7 +490,11 @@ } OutStreamer->emitRawComment(" Kernel info:", false); - emitCommonFunctionComments(CurrentProgramInfo.NumVGPR, + emitCommonFunctionComments(CurrentProgramInfo.NumArchVGPR, + STM.hasMAIInsts() + ? 
CurrentProgramInfo.NumAccVGPR + : Optional<uint32_t>(), + CurrentProgramInfo.NumVGPR, CurrentProgramInfo.NumSGPR, CurrentProgramInfo.ScratchSize, getFunctionCodeSize(MF), MFI); @@ -592,6 +605,11 @@ UsesVCC, UsesFlatScratch); } +int32_t AMDGPUAsmPrinter::SIFunctionResourceInfo::getTotalNumVGPRs( + const GCNSubtarget &ST) const { + return std::max(NumVGPR, NumAGPR); +} + AMDGPUAsmPrinter::SIFunctionResourceInfo AMDGPUAsmPrinter::analyzeResourceUsage( const MachineFunction &MF) const { SIFunctionResourceInfo Info; @@ -638,11 +656,18 @@ HighestVGPRReg = Reg; break; } - MCPhysReg AReg = AMDGPU::AGPR0 + TRI.getHWRegIndex(Reg); - if (MRI.isPhysRegUsed(AReg)) { - HighestVGPRReg = AReg; - break; + } + + if (ST.hasMAIInsts()) { + MCPhysReg HighestAGPRReg = AMDGPU::NoRegister; + for (MCPhysReg Reg : reverse(AMDGPU::AGPR_32RegClass.getRegisters())) { + if (MRI.isPhysRegUsed(Reg)) { + HighestAGPRReg = Reg; + break; + } } + Info.NumAGPR = HighestAGPRReg == AMDGPU::NoRegister ? 0 : + TRI.getHWRegIndex(HighestAGPRReg) + 1; } MCPhysReg HighestSGPRReg = AMDGPU::NoRegister; @@ -664,6 +689,7 @@ } int32_t MaxVGPR = -1; + int32_t MaxAGPR = -1; int32_t MaxSGPR = -1; uint64_t CalleeFrameSize = 0; @@ -673,6 +699,7 @@ for (const MachineOperand &MO : MI.operands()) { unsigned Width = 0; bool IsSGPR = false; + bool IsAGPR = false; if (!MO.isReg()) continue; @@ -748,6 +775,7 @@ Width = 1; } else if (AMDGPU::AGPR_32RegClass.contains(Reg)) { IsSGPR = false; + IsAGPR = true; Width = 1; } else if (AMDGPU::SReg_64RegClass.contains(Reg)) { assert(!AMDGPU::TTMP_64RegClass.contains(Reg) && @@ -759,6 +787,7 @@ Width = 2; } else if (AMDGPU::AReg_64RegClass.contains(Reg)) { IsSGPR = false; + IsAGPR = true; Width = 2; } else if (AMDGPU::VReg_96RegClass.contains(Reg)) { IsSGPR = false; @@ -775,6 +804,7 @@ Width = 4; } else if (AMDGPU::AReg_128RegClass.contains(Reg)) { IsSGPR = false; + IsAGPR = true; Width = 4; } else if (AMDGPU::SReg_256RegClass.contains(Reg)) { assert(!AMDGPU::TTMP_256RegClass.contains(Reg) && @@ 
-794,6 +824,7 @@ Width = 16; } else if (AMDGPU::AReg_512RegClass.contains(Reg)) { IsSGPR = false; + IsAGPR = true; Width = 16; } else if (AMDGPU::SReg_1024RegClass.contains(Reg)) { IsSGPR = true; @@ -803,6 +834,7 @@ Width = 32; } else if (AMDGPU::AReg_1024RegClass.contains(Reg)) { IsSGPR = false; + IsAGPR = true; Width = 32; } else { llvm_unreachable("Unknown register class"); @@ -811,6 +843,8 @@ int MaxUsed = HWReg + Width - 1; if (IsSGPR) { MaxSGPR = MaxUsed > MaxSGPR ? MaxUsed : MaxSGPR; + } else if (IsAGPR) { + MaxAGPR = MaxUsed > MaxAGPR ? MaxUsed : MaxAGPR; } else { MaxVGPR = MaxUsed > MaxVGPR ? MaxUsed : MaxVGPR; } @@ -832,6 +866,7 @@ 47 - IsaInfo::getNumExtraSGPRs(&ST, true, ST.hasFlatAddressSpace()); MaxSGPR = std::max(MaxSGPR, MaxSGPRGuess); MaxVGPR = std::max(MaxVGPR, 23); + MaxAGPR = std::max(MaxAGPR, 23); CalleeFrameSize = std::max(CalleeFrameSize, UINT64_C(16384)); Info.UsesVCC = true; @@ -856,6 +891,7 @@ MaxSGPR = std::max(I->second.NumExplicitSGPR - 1, MaxSGPR); MaxVGPR = std::max(I->second.NumVGPR - 1, MaxVGPR); + MaxAGPR = std::max(I->second.NumAGPR - 1, MaxAGPR); CalleeFrameSize = std::max(I->second.PrivateSegmentSize, CalleeFrameSize); Info.UsesVCC |= I->second.UsesVCC; @@ -872,6 +908,7 @@ Info.NumExplicitSGPR = MaxSGPR + 1; Info.NumVGPR = MaxVGPR + 1; + Info.NumAGPR = MaxAGPR + 1; Info.PrivateSegmentSize += CalleeFrameSize; return Info; @@ -880,8 +917,11 @@ void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo, const MachineFunction &MF) { SIFunctionResourceInfo Info = analyzeResourceUsage(MF); + const GCNSubtarget &STM = MF.getSubtarget<GCNSubtarget>(); - ProgInfo.NumVGPR = Info.NumVGPR; + ProgInfo.NumArchVGPR = Info.NumVGPR; + ProgInfo.NumAccVGPR = Info.NumAGPR; + ProgInfo.NumVGPR = Info.getTotalNumVGPRs(STM); ProgInfo.NumSGPR = Info.NumExplicitSGPR; ProgInfo.ScratchSize = Info.PrivateSegmentSize; ProgInfo.VCCUsed = Info.UsesVCC; @@ -894,7 +934,6 @@ MF.getFunction().getContext().diagnose(DiagStackSize); } - const GCNSubtarget &STM = 
MF.getSubtarget<GCNSubtarget>(); const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); // TODO(scott.linder): The calculations related to SGPR/VGPR blocks are Index: llvm/lib/Target/AMDGPU/SIProgramInfo.h =================================================================== --- llvm/lib/Target/AMDGPU/SIProgramInfo.h +++ llvm/lib/Target/AMDGPU/SIProgramInfo.h @@ -41,6 +41,8 @@ uint64_t ComputePGMRSrc2 = 0; uint32_t NumVGPR = 0; + uint32_t NumArchVGPR = 0; + uint32_t NumAccVGPR = 0; uint32_t NumSGPR = 0; uint32_t LDSSize = 0; bool FlatUsed = false; Index: llvm/test/CodeGen/AMDGPU/agpr-register-count.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/agpr-register-count.ll +++ llvm/test/CodeGen/AMDGPU/agpr-register-count.ll @@ -1,15 +1,134 @@ -; RUN: llc -march=amdgcn -mcpu=gfx908 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s +; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx908 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN %s -declare <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float, float, <32 x float>, i32, i32, i32) +; GCN-LABEL: {{^}}kernel_32_agprs: +; GCN: .amdhsa_next_free_vgpr 32 +; GCN: NumVgprs: 9 +; GCN: NumAgprs: 32 +; GCN: TotalNumVgprs: 32 +; GCN: VGPRBlocks: 7 +; GCN: NumVGPRsForWavesPerEU: 32 +; GCN: Occupancy: 8 +define amdgpu_kernel void @kernel_32_agprs() { +bb: + call void asm sideeffect "", "~{v8}" () + call void asm sideeffect "", "~{a31}" () + ret void +} + +; GCN-LABEL: {{^}}kernel_0_agprs: +; GCN: .amdhsa_next_free_vgpr 1 +; GCN: NumVgprs: 1 +; GCN: NumAgprs: 0 +; GCN: TotalNumVgprs: 1 +; GCN: VGPRBlocks: 0 +; GCN: NumVGPRsForWavesPerEU: 1 +; GCN: Occupancy: 10 +define amdgpu_kernel void @kernel_0_agprs() { +bb: + call void asm sideeffect "", "~{v0}" () + ret void +} + +; GCN-LABEL: {{^}}kernel_40_vgprs: +; GCN: .amdhsa_next_free_vgpr 40 +; GCN: NumVgprs: 40 +; GCN: NumAgprs: 16 +; GCN: TotalNumVgprs: 40 +; GCN: VGPRBlocks: 9 +; GCN: NumVGPRsForWavesPerEU: 40 +; GCN: Occupancy: 6 +define 
amdgpu_kernel void @kernel_40_vgprs() { +bb: + call void asm sideeffect "", "~{v39}" () + call void asm sideeffect "", "~{a15}" () + ret void +} + +; GCN-LABEL: {{^}}func_32_agprs: +; GCN: NumVgprs: 9 +; GCN: NumAgprs: 32 +; GCN: TotalNumVgprs: 32 +define void @func_32_agprs() #0 { +bb: + call void asm sideeffect "", "~{v8}" () + call void asm sideeffect "", "~{a31}" () + ret void +} + +; GCN-LABEL: {{^}}func_32_vgprs: +; GCN: NumVgprs: 32 +; GCN: NumAgprs: 9 +; GCN: TotalNumVgprs: 32 +define void @func_32_vgprs() { +bb: + call void asm sideeffect "", "~{v31}" () + call void asm sideeffect "", "~{a8}" () + ret void +} -; GCN-LABEL: {{^}}test_32_agprs: -; GCN: v_mfma_f32_32x32x1f32 a[0:31], {{v[0-9]+}}, {{v[0-9]+}}, -; GCN-NOT: v28 -; GCN: NumVgprs: 32 -; GCN: VGPRBlocks: 7 -define amdgpu_kernel void @test_32_agprs(<32 x float> addrspace(1)* %arg) { +; GCN-LABEL: {{^}}func_0_agprs: +; GCN: NumVgprs: 1 +; GCN: NumAgprs: 0 +; GCN: TotalNumVgprs: 1 +define amdgpu_kernel void @func_0_agprs() { bb: - %mai.1 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> , i32 0, i32 0, i32 0) - store <32 x float> %mai.1, <32 x float> addrspace(1)* %arg + call void asm sideeffect "", "~{v0}" () ret void } + +; GCN-LABEL: {{^}}kernel_max_gprs: +; GCN: .amdhsa_next_free_vgpr 256 +; GCN: NumVgprs: 256 +; GCN: NumAgprs: 256 +; GCN: TotalNumVgprs: 256 +; GCN: VGPRBlocks: 63 +; GCN: NumVGPRsForWavesPerEU: 256 +; GCN: Occupancy: 1 +define amdgpu_kernel void @kernel_max_gprs() { +bb: + call void asm sideeffect "", "~{v255}" () + call void asm sideeffect "", "~{a255}" () + ret void +} + +; GCN-LABEL: {{^}}kernel_call_func_32_agprs: +; GCN: .amdhsa_next_free_vgpr 32 +; GCN: NumVgprs: 9 +; GCN: NumAgprs: 32 +; GCN: TotalNumVgprs: 32 +; GCN: VGPRBlocks: 7 +; GCN: NumVGPRsForWavesPerEU: 32 +; GCN: Occupancy: 8 +define amdgpu_kernel void @kernel_call_func_32_agprs() { +bb: + call void @func_32_agprs() #0 + ret void +} + +; GCN-LABEL: 
{{^}}func_call_func_32_agprs: +; GCN: NumVgprs: 9 +; GCN: NumAgprs: 32 +; GCN: TotalNumVgprs: 32 +define void @func_call_func_32_agprs() { +bb: + call void @func_32_agprs() #0 + ret void +} + +declare void @undef_func() + +; GCN-LABEL: {{^}}kernel_call_undef_func: +; GCN: .amdhsa_next_free_vgpr 24 +; GCN: NumVgprs: 24 +; GCN: NumAgprs: 24 +; GCN: TotalNumVgprs: 24 +; GCN: VGPRBlocks: 5 +; GCN: NumVGPRsForWavesPerEU: 24 +; GCN: Occupancy: 10 +define amdgpu_kernel void @kernel_call_undef_func() { +bb: + call void @undef_func() + ret void +} + +attributes #0 = { nounwind noinline }