Index: lib/Target/AMDGPU/AMDGPUAsmPrinter.h =================================================================== --- lib/Target/AMDGPU/AMDGPUAsmPrinter.h +++ lib/Target/AMDGPU/AMDGPUAsmPrinter.h @@ -54,7 +54,7 @@ uint32_t NumVGPR = 0; uint32_t NumSGPR = 0; - uint32_t LDSSize; + uint32_t LDSSize = 0; bool FlatUsed = false; // Number of SGPRs that meets number of waves per execution unit request. @@ -84,11 +84,11 @@ // Bonus information for debugging. bool VCCUsed = false; - uint64_t CodeLen = 0; SIProgramInfo() = default; }; + uint64_t getFunctionCodeSize(const MachineFunction &MF) const; void getSIProgramInfo(SIProgramInfo &Out, const MachineFunction &MF) const; void getAmdKernelCode(amd_kernel_code_t &Out, const SIProgramInfo &KernelInfo, const MachineFunction &MF) const; Index: lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp =================================================================== --- lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp +++ lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp @@ -218,8 +218,8 @@ if (STM.getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS) { OutStreamer->emitRawComment(" Kernel info:", false); - OutStreamer->emitRawComment(" codeLenInByte = " + Twine(KernelInfo.CodeLen), - false); + OutStreamer->emitRawComment(" codeLenInByte = " + + Twine(getFunctionCodeSize(MF)), false); OutStreamer->emitRawComment(" NumSgprs: " + Twine(KernelInfo.NumSGPR), false); OutStreamer->emitRawComment(" NumVgprs: " + Twine(KernelInfo.NumVGPR), @@ -357,18 +357,12 @@ } } -void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo, - const MachineFunction &MF) const { +uint64_t AMDGPUAsmPrinter::getFunctionCodeSize(const MachineFunction &MF) const { const SISubtarget &STM = MF.getSubtarget(); - const SIMachineFunctionInfo *MFI = MF.getInfo(); - uint64_t CodeSize = 0; - unsigned MaxSGPR = 0; - unsigned MaxVGPR = 0; - bool VCCUsed = false; - bool FlatUsed = false; - const SIRegisterInfo *RI = STM.getRegisterInfo(); const SIInstrInfo *TII = STM.getInstrInfo(); + uint64_t CodeSize = 0; + for (const MachineBasicBlock &MBB : MF) { for (const MachineInstr &MI : MBB) { // TODO: CodeSize should account for multiple functions. @@ -377,122 +371,86 @@ if (MI.isDebugValue()) continue; - if (isVerbose()) - CodeSize += TII->getInstSizeInBytes(MI); + CodeSize += TII->getInstSizeInBytes(MI); + } + } - unsigned numOperands = MI.getNumOperands(); - for (unsigned op_idx = 0; op_idx < numOperands; op_idx++) { - const MachineOperand &MO = MI.getOperand(op_idx); - unsigned width = 0; - bool isSGPR = false; + return CodeSize; +} - if (!MO.isReg()) - continue; +static bool hasAnyNonFlatUseOfReg(const MachineRegisterInfo &MRI, + const SIInstrInfo &TII, + unsigned Reg) { + for (const MachineOperand &UseOp : MRI.reg_operands(Reg)) { + if (!UseOp.isImplicit() || !TII.isFLAT(*UseOp.getParent())) + return true; + } - unsigned reg = MO.getReg(); - switch (reg) { - case AMDGPU::EXEC: - case AMDGPU::EXEC_LO: - case AMDGPU::EXEC_HI: - case AMDGPU::SCC: - case AMDGPU::M0: - case AMDGPU::SRC_SHARED_BASE: - case AMDGPU::SRC_SHARED_LIMIT: - case AMDGPU::SRC_PRIVATE_BASE: - case AMDGPU::SRC_PRIVATE_LIMIT: - continue; + return false; +} - case AMDGPU::VCC: - case AMDGPU::VCC_LO: - case AMDGPU::VCC_HI: - VCCUsed = true; - continue; +void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo, + const MachineFunction &MF) const { + const SISubtarget &STM = MF.getSubtarget(); + const SIMachineFunctionInfo *MFI = MF.getInfo(); + const MachineRegisterInfo &MRI = MF.getRegInfo(); + const SIInstrInfo *TII = STM.getInstrInfo(); + const SIRegisterInfo *RI = &TII->getRegisterInfo(); - case AMDGPU::FLAT_SCR: - case AMDGPU::FLAT_SCR_LO: - case AMDGPU::FLAT_SCR_HI: - // Even if FLAT_SCRATCH is implicitly used, it has no effect if flat - // instructions aren't used to access the scratch buffer. - if (MFI->hasFlatScratchInit()) - FlatUsed = true; - continue; - case AMDGPU::TBA: - case AMDGPU::TBA_LO: - case AMDGPU::TBA_HI: - case AMDGPU::TMA: - case AMDGPU::TMA_LO: - case AMDGPU::TMA_HI: - llvm_unreachable("trap handler registers should not be used"); - - default: - break; - } - - if (AMDGPU::SReg_32RegClass.contains(reg)) { - assert(!AMDGPU::TTMP_32RegClass.contains(reg) && - "trap handler registers should not be used"); - isSGPR = true; - width = 1; - } else if (AMDGPU::VGPR_32RegClass.contains(reg)) { - isSGPR = false; - width = 1; - } else if (AMDGPU::SReg_64RegClass.contains(reg)) { - assert(!AMDGPU::TTMP_64RegClass.contains(reg) && - "trap handler registers should not be used"); - isSGPR = true; - width = 2; - } else if (AMDGPU::VReg_64RegClass.contains(reg)) { - isSGPR = false; - width = 2; - } else if (AMDGPU::VReg_96RegClass.contains(reg)) { - isSGPR = false; - width = 3; - } else if (AMDGPU::SReg_128RegClass.contains(reg)) { - isSGPR = true; - width = 4; - } else if (AMDGPU::VReg_128RegClass.contains(reg)) { - isSGPR = false; - width = 4; - } else if (AMDGPU::SReg_256RegClass.contains(reg)) { - isSGPR = true; - width = 8; - } else if (AMDGPU::VReg_256RegClass.contains(reg)) { - isSGPR = false; - width = 8; - } else if (AMDGPU::SReg_512RegClass.contains(reg)) { - isSGPR = true; - width = 16; - } else if (AMDGPU::VReg_512RegClass.contains(reg)) { - isSGPR = false; - width = 16; - } else { - llvm_unreachable("Unknown register class"); - } - unsigned hwReg = RI->getEncodingValue(reg) & 0xff; - unsigned maxUsed = hwReg + width - 1; - if (isSGPR) { - MaxSGPR = maxUsed > MaxSGPR ? maxUsed : MaxSGPR; - } else { - MaxVGPR = maxUsed > MaxVGPR ? maxUsed : MaxVGPR; - } - } + MCPhysReg NumVGPRReg = AMDGPU::NoRegister; + for (MCPhysReg Reg : reverse(AMDGPU::VGPR_32RegClass.getRegisters())) { + if (MRI.isPhysRegUsed(Reg)) { + NumVGPRReg = Reg; + break; + } + } + + MCPhysReg NumSGPRReg = AMDGPU::NoRegister; + for (MCPhysReg Reg : reverse(AMDGPU::SGPR_32RegClass.getRegisters())) { + if (MRI.isPhysRegUsed(Reg)) { + NumSGPRReg = Reg; + break; } } + // We found the maximum register index. They start at 0, so add one to get the + // number of registers. + ProgInfo.NumVGPR = NumVGPRReg == AMDGPU::NoRegister ? 0 : + (RI->getEncodingValue(NumVGPRReg) & 0xff) + 1; + ProgInfo.NumSGPR = NumSGPRReg == AMDGPU::NoRegister ? 0 : + (RI->getEncodingValue(NumSGPRReg) & 0xff) + 1; unsigned ExtraSGPRs = 0; - if (VCCUsed) + ProgInfo.VCCUsed = MRI.isPhysRegUsed(AMDGPU::VCC_LO) || + MRI.isPhysRegUsed(AMDGPU::VCC_HI); + if (ProgInfo.VCCUsed) ExtraSGPRs = 2; + ProgInfo.FlatUsed = MRI.isPhysRegUsed(AMDGPU::FLAT_SCR_LO) || + MRI.isPhysRegUsed(AMDGPU::FLAT_SCR_HI); + + // Even if FLAT_SCRATCH is implicitly used, it has no effect if flat + // instructions aren't used to access the scratch buffer. Inline assembly + // may need it though. + // + // If we only have implicit uses of flat_scr on flat instructions, it is not + // really needed. + if (ProgInfo.FlatUsed && !MFI->hasFlatScratchInit() && + (!hasAnyNonFlatUseOfReg(MRI, *TII, AMDGPU::FLAT_SCR) && + !hasAnyNonFlatUseOfReg(MRI, *TII, AMDGPU::FLAT_SCR_LO) && + !hasAnyNonFlatUseOfReg(MRI, *TII, AMDGPU::FLAT_SCR_HI))) { + ProgInfo.FlatUsed = false; + } + if (STM.getGeneration() < SISubtarget::VOLCANIC_ISLANDS) { - if (FlatUsed) + if (ProgInfo.FlatUsed) ExtraSGPRs = 4; } else { if (STM.isXNACKEnabled()) ExtraSGPRs = 4; - if (FlatUsed) + if (ProgInfo.FlatUsed) ExtraSGPRs = 6; } @@ -502,34 +460,29 @@ if (STM.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS && !STM.hasSGPRInitBug()) { unsigned MaxAddressableNumSGPRs = STM.getAddressableNumSGPRs(); - if (MaxSGPR + 1 > MaxAddressableNumSGPRs) { + if (ProgInfo.NumSGPR > MaxAddressableNumSGPRs) { // This can happen due to a compiler bug or when using inline asm. LLVMContext &Ctx = MF.getFunction()->getContext(); DiagnosticInfoResourceLimit Diag(*MF.getFunction(), "addressable scalar registers", - MaxSGPR + 1, DS_Error, + ProgInfo.NumSGPR, DS_Error, DK_ResourceLimit, MaxAddressableNumSGPRs); Ctx.diagnose(Diag); - MaxSGPR = MaxAddressableNumSGPRs - 1; + ProgInfo.NumSGPR = MaxAddressableNumSGPRs - 1; } } // Account for extra SGPRs and VGPRs reserved for debugger use. - MaxSGPR += ExtraSGPRs; - MaxVGPR += ExtraVGPRs; - - // We found the maximum register index. They start at 0, so add one to get the - // number of registers. - ProgInfo.NumSGPR = MaxSGPR + 1; - ProgInfo.NumVGPR = MaxVGPR + 1; + ProgInfo.NumSGPR += ExtraSGPRs; + ProgInfo.NumVGPR += ExtraVGPRs; // Adjust number of registers used to meet default/requested minimum/maximum // number of waves per execution unit request. ProgInfo.NumSGPRsForWavesPerEU = std::max( - ProgInfo.NumSGPR, STM.getMinNumSGPRs(MFI->getMaxWavesPerEU())); + std::max(ProgInfo.NumSGPR, 1u), STM.getMinNumSGPRs(MFI->getMaxWavesPerEU())); ProgInfo.NumVGPRsForWavesPerEU = std::max( - ProgInfo.NumVGPR, STM.getMinNumVGPRs(MFI->getMaxWavesPerEU())); + std::max(ProgInfo.NumVGPR, 1u), STM.getMinNumVGPRs(MFI->getMaxWavesPerEU())); if (STM.getGeneration() <= AMDGPUSubtarget::SEA_ISLANDS || STM.hasSGPRInitBug()) { @@ -581,7 +534,7 @@ ProgInfo.VGPRBlocks = ProgInfo.VGPRBlocks / STM.getVGPREncodingGranule() - 1; // Record first reserved VGPR and number of reserved VGPRs. - ProgInfo.ReservedVGPRFirst = STM.debuggerReserveRegs() ? MaxVGPR + 1 : 0; + ProgInfo.ReservedVGPRFirst = STM.debuggerReserveRegs() ? ProgInfo.NumVGPR : 0; ProgInfo.ReservedVGPRCount = STM.getReservedNumVGPRs(MF); // Update DebuggerWavefrontPrivateSegmentOffsetSGPR and @@ -606,10 +559,6 @@ const MachineFrameInfo &FrameInfo = MF.getFrameInfo(); ProgInfo.ScratchSize = FrameInfo.getStackSize(); - ProgInfo.FlatUsed = FlatUsed; - ProgInfo.VCCUsed = VCCUsed; - ProgInfo.CodeLen = CodeSize; - unsigned LDSAlignShift; if (STM.getGeneration() < SISubtarget::SEA_ISLANDS) { // LDS is allocated in 64 dword blocks. Index: test/CodeGen/AMDGPU/code-object-metadata-from-llvm-ir-full.ll =================================================================== --- test/CodeGen/AMDGPU/code-object-metadata-from-llvm-ir-full.ll +++ test/CodeGen/AMDGPU/code-object-metadata-from-llvm-ir-full.ll @@ -1253,8 +1253,8 @@ ; NOTES-NEXT: Owner Data size Description ; NOTES-NEXT: AMD 0x00000008 Unknown note type: (0x00000001) ; NOTES-NEXT: AMD 0x0000001b Unknown note type: (0x00000003) -; GFX700: AMD 0x00009171 Unknown note type: (0x0000000a) -; GFX800: AMD 0x00009190 Unknown note type: (0x0000000a) -; GFX900: AMD 0x00009171 Unknown note type: (0x0000000a) +; GFX700: AMD 0x00008b06 Unknown note type: (0x0000000a) +; GFX800: AMD 0x00008e6a Unknown note type: (0x0000000a) +; GFX900: AMD 0x00008b06 Unknown note type: (0x0000000a) ; PARSER: AMDGPU Code Object Metadata Parser Test: PASS Index: test/CodeGen/AMDGPU/exceed-max-sgprs.ll =================================================================== --- test/CodeGen/AMDGPU/exceed-max-sgprs.ll +++ test/CodeGen/AMDGPU/exceed-max-sgprs.ll @@ -38,7 +38,7 @@ ret void } -; ERROR: error: scalar registers limit of 104 exceeded (106) in use_too_many_sgprs_bonaire_flat_scr +; ERROR: error: scalar registers limit of 104 exceeded (108) in use_too_many_sgprs_bonaire_flat_scr define amdgpu_kernel void @use_too_many_sgprs_bonaire_flat_scr() #1 { call void asm sideeffect "", "~{SGPR0_SGPR1_SGPR2_SGPR3_SGPR4_SGPR5_SGPR6_SGPR7}" () call void asm sideeffect "", "~{SGPR8_SGPR9_SGPR10_SGPR11_SGPR12_SGPR13_SGPR14_SGPR15}" () Index: test/CodeGen/AMDGPU/flat-scratch-reg.ll =================================================================== --- test/CodeGen/AMDGPU/flat-scratch-reg.ll +++ test/CodeGen/AMDGPU/flat-scratch-reg.ll @@ -44,12 +44,12 @@ ; HSA-VI-NOXNACK: is_xnack_enabled = 0 ; HSA-VI-XNACK: is_xnack_enabled = 1 -; CI: ; NumSgprs: 8 -; VI-NOXNACK: ; NumSgprs: 8 -; VI-XNACK: ; NumSgprs: 12 -; HSA-CI: ; NumSgprs: 8 -; HSA-VI-NOXNACK: ; NumSgprs: 8 -; HSA-VI-XNACK: ; NumSgprs: 12 +; CI: ; NumSgprs: 12 +; VI-NOXNACK: ; NumSgprs: 14 +; VI-XNACK: ; NumSgprs: 14 +; HSA-CI: ; NumSgprs: 12 +; HSA-VI-NOXNACK: ; NumSgprs: 14 +; HSA-VI-XNACK: ; NumSgprs: 14 define amdgpu_kernel void @no_vcc_flat() { entry: call void asm sideeffect "", "~{SGPR7},~{FLAT_SCR}"() @@ -60,14 +60,49 @@ ; HSA-NOXNACK: is_xnack_enabled = 0 ; HSA-XNACK: is_xnack_enabled = 1 -; CI: ; NumSgprs: 10 -; VI-NOXNACK: ; NumSgprs: 10 -; VI-XNACK: ; NumSgprs: 12 -; HSA-CI: ; NumSgprs: 10 -; HSA-VI-NOXNACK: ; NumSgprs: 10 -; HSA-VI-XNACK: ; NumSgprs: 12 +; CI: ; NumSgprs: 12 +; VI-NOXNACK: ; NumSgprs: 14 +; VI-XNACK: ; NumSgprs: 14 +; HSA-CI: ; NumSgprs: 12 +; HSA-VI-NOXNACK: ; NumSgprs: 14 +; HSA-VI-XNACK: ; NumSgprs: 14 define amdgpu_kernel void @vcc_flat() { entry: call void asm sideeffect "", "~{SGPR7},~{VCC},~{FLAT_SCR}"() ret void } + +; Make sure used SGPR count for flat_scr is correct when there is no +; scratch usage and implicit flat uses. + +; GCN-LABEL: {{^}}use_flat_scr: +; CI: NumSgprs: 4 +; VI-NOXNACK: NumSgprs: 6 +; VI-XNACK: NumSgprs: 6 +define amdgpu_kernel void @use_flat_scr() #0 { +entry: + call void asm sideeffect "; clobber ", "~{FLAT_SCR}"() + ret void +} + +; GCN-LABEL: {{^}}use_flat_scr_lo: +; CI: NumSgprs: 4 +; VI-NOXNACK: NumSgprs: 6 +; VI-XNACK: NumSgprs: 6 +define amdgpu_kernel void @use_flat_scr_lo() #0 { +entry: + call void asm sideeffect "; clobber ", "~{FLAT_SCR_LO}"() + ret void +} + +; GCN-LABEL: {{^}}use_flat_scr_hi: +; CI: NumSgprs: 4 +; VI-NOXNACK: NumSgprs: 6 +; VI-XNACK: NumSgprs: 6 +define amdgpu_kernel void @use_flat_scr_hi() #0 { +entry: + call void asm sideeffect "; clobber ", "~{FLAT_SCR_HI}"() + ret void +} + +attributes #0 = { nounwind }