diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
--- a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
@@ -625,13 +625,10 @@
   return 5;
 }
 
-unsigned GCNSubtarget::getOccupancyWithNumVGPRs(unsigned VGPRs) const {
-  unsigned MaxWaves = getMaxWavesPerEU();
-  unsigned Granule = getVGPRAllocGranule();
-  if (VGPRs < Granule)
-    return MaxWaves;
-  unsigned RoundedRegs = ((VGPRs + Granule - 1) / Granule) * Granule;
-  return std::min(std::max(getTotalNumVGPRs() / RoundedRegs, 1u), MaxWaves);
+// Delegates to the IsaInfo helper so the occupancy computation lives in one
+// place.
+unsigned GCNSubtarget::getOccupancyWithNumVGPRs(unsigned NumVGPRs) const {
+  return AMDGPU::IsaInfo::getNumWavesWithNumVGPRs(this, NumVGPRs);
 }
 
 unsigned
diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
--- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
+++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
@@ -286,6 +286,10 @@
 /// execution unit requirement for given subtarget \p STI.
 unsigned getMaxNumVGPRs(const MCSubtargetInfo *STI, unsigned WavesPerEU);
 
+/// \returns Number of waves per execution unit that fit on subtarget \p STI
+/// when each wave uses \p NumVGPRs VGPRs.
+unsigned getNumWavesWithNumVGPRs(const MCSubtargetInfo *STI, unsigned NumVGPRs);
+
 /// \returns Number of VGPR blocks needed for given subtarget \p STI when
 /// \p NumVGPRs are used.
 ///
diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
--- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
@@ -986,15 +986,54 @@
   return 256;
 }
 
+// Number of waves that fit when each wave is allocated NumVGPRs registers,
+// rounded up to a whole number of allocation granules. The result is kept
+// within [1, getMaxWavesPerEU(STI)].
+unsigned getNumWavesWithNumVGPRs(const MCSubtargetInfo *STI,
+                                 unsigned NumVGPRs) {
+  const unsigned WaveLimit = getMaxWavesPerEU(STI);
+  const unsigned AllocGranule = getVGPRAllocGranule(STI);
+  // A request below one granule never lowers the wave count.
+  if (NumVGPRs < AllocGranule)
+    return WaveLimit;
+
+  const unsigned NumGranules = (NumVGPRs + AllocGranule - 1) / AllocGranule;
+  const unsigned AllocatedVGPRs = NumGranules * AllocGranule;
+  const unsigned FittingWaves = getTotalNumVGPRs(STI) / AllocatedVGPRs;
+  return std::min(std::max(FittingWaves, 1u), WaveLimit);
+}
+
 unsigned getMinNumVGPRs(const MCSubtargetInfo *STI, unsigned WavesPerEU) {
   assert(WavesPerEU != 0);
 
-  if (WavesPerEU >= getMaxWavesPerEU(STI))
+  const unsigned WaveLimit = getMaxWavesPerEU(STI);
+  if (WavesPerEU >= WaveLimit)
     return 0;
-  unsigned MinNumVGPRs =
-      alignDown(getTotalNumVGPRs(STI) / (WavesPerEU + 1),
-                getVGPRAllocGranule(STI)) + 1;
-  return std::min(MinNumVGPRs, getAddressableNumVGPRs(STI));
+
+  const unsigned TotalVGPRs = getTotalNumVGPRs(STI);
+  const unsigned AllocGranule = getVGPRAllocGranule(STI);
+  // Largest granule-aligned per-wave budget with WavesPerEU waves resident.
+  const unsigned BudgetAtOcc =
+      alignDown(TotalVGPRs / WavesPerEU, AllocGranule);
+
+  // If that budget already yields the maximum wave count, there is no lower
+  // bound to report.
+  const unsigned WavesAtBudget = getNumWavesWithNumVGPRs(STI, BudgetAtOcc);
+  if (WavesAtBudget >= WaveLimit)
+    return 0;
+
+  // If the budget yields fewer waves than are obtained with every addressable
+  // VGPR in use, answer for that wave count instead.
+  const unsigned AddressableVGPRs = getAddressableNumVGPRs(STI);
+  const unsigned LowestOcc = getNumWavesWithNumVGPRs(STI, AddressableVGPRs);
+  if (WavesAtBudget < LowestOcc)
+    return getMinNumVGPRs(STI, LowestOcc);
+
+  const unsigned BudgetAtNextOcc =
+      alignDown(TotalVGPRs / (WavesPerEU + 1), AllocGranule);
+  const unsigned LowerBound =
+      1 + std::min(BudgetAtOcc - AllocGranule, BudgetAtNextOcc);
+  return std::min(LowerBound, AddressableVGPRs);
 }
 
 unsigned getMaxNumVGPRs(const MCSubtargetInfo *STI, unsigned WavesPerEU) {
diff --git a/llvm/test/CodeGen/AMDGPU/occupancy-levels.ll b/llvm/test/CodeGen/AMDGPU/occupancy-levels.ll
--- a/llvm/test/CodeGen/AMDGPU/occupancy-levels.ll
+++ 
b/llvm/test/CodeGen/AMDGPU/occupancy-levels.ll
@@ -40,7 +40,7 @@
 
 ; GCN-LABEL: {{^}}limited_occupancy_19:
 ; GFX9: ; Occupancy: 10
-; GFX1010: ; Occupancy: 18
+; GFX1010: ; Occupancy: 20
 ; GFX1030: ; Occupancy: 16
 ; GFX1100: ; Occupancy: 16
 define amdgpu_kernel void @limited_occupancy_19() #2 {
diff --git a/llvm/unittests/Target/AMDGPU/AMDGPUUnitTests.cpp b/llvm/unittests/Target/AMDGPU/AMDGPUUnitTests.cpp
new file mode 100644
--- /dev/null
+++ b/llvm/unittests/Target/AMDGPU/AMDGPUUnitTests.cpp
@@ -0,0 +1,165 @@
+//===- llvm/unittests/Target/AMDGPU/AMDGPUUnitTests.cpp -----===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPUTargetMachine.h"
+#include "GCNSubtarget.h"
+#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
+#include "llvm/CodeGen/MachineModuleInfo.h"
+#include "llvm/CodeGen/TargetSubtargetInfo.h"
+#include "llvm/MC/MCTargetOptions.h"
+#include "llvm/MC/TargetRegistry.h"
+#include "llvm/Support/TargetParser.h"
+#include "llvm/Support/TargetSelect.h"
+#include "llvm/Target/TargetMachine.h"
+#include "gtest/gtest.h"
+#include <functional>
+#include <iomanip>
+#include <map>
+#include <memory>
+#include <sstream>
+#include <string>
+#include <utility>
+#include <vector>
+
+using namespace llvm;
+
+// Forces the per-CPU limit tables to be reported even when all checks pass.
+static cl::opt<bool> PrintCpuRegLimits(
+    "print-cpu-reg-limits", cl::NotHidden, cl::init(false),
+    cl::desc("force printing per AMDGPU CPU register limits"));
+
+// implementation is in the llvm/unittests/Target/AMDGPU/DwarfRegMappings.cpp
+// NOTE(review): the pointee type was missing from the posted patch; it has to
+// match the definition in DwarfRegMappings.cpp -- confirm before landing.
+std::unique_ptr<const GCNTargetMachine>
+createTargetMachine(std::string TStr, StringRef CPU, StringRef FS);
+
+/// Checks that the [min, max] register range reported for occupancy \p Occ is
+/// consistent with \p GetOcc and appends one table row describing it to \p OS.
+///
+/// The range is valid when min < max and every register count in it maps to
+/// the same occupancy. When that occupancy lies in (\p MinOcc, \p MaxOcc], the
+/// range must also be tight: one register less must give a higher occupancy
+/// and one register more a lower one. Violations are marked in the row with
+/// '<' (min), 'R' (range) and '>' (max).
+///
+/// \returns true if no violation was found.
+bool checkMinMax(std::stringstream &OS, unsigned Occ, unsigned MinOcc,
+                 unsigned MaxOcc, std::function<unsigned(unsigned)> GetOcc,
+                 std::function<unsigned(unsigned)> GetMinGPRs,
+                 std::function<unsigned(unsigned)> GetMaxGPRs) {
+  bool MinValid = true, MaxValid = true, RangeValid = true;
+  unsigned MinGPRs = GetMinGPRs(Occ);
+  unsigned MaxGPRs = GetMaxGPRs(Occ);
+  // Only meaningful while RangeValid holds; zero-initialized so that it never
+  // holds an indeterminate value.
+  unsigned RealOcc = 0;
+
+  if (MinGPRs >= MaxGPRs)
+    RangeValid = false;
+  else {
+    RealOcc = GetOcc(MinGPRs);
+    for (unsigned NumRegs = MinGPRs + 1; NumRegs <= MaxGPRs; ++NumRegs) {
+      if (RealOcc != GetOcc(NumRegs)) {
+        RangeValid = false;
+        break;
+      }
+    }
+  }
+
+  if (RangeValid && RealOcc > MinOcc && RealOcc <= MaxOcc) {
+    if (MinGPRs > 0 && GetOcc(MinGPRs - 1) <= RealOcc)
+      MinValid = false;
+
+    if (GetOcc(MaxGPRs + 1) >= RealOcc)
+      MaxValid = false;
+  }
+
+  std::stringstream MinStr;
+  MinStr << (MinValid ? ' ' : '<') << ' ' << std::setw(3) << MinGPRs << " (O"
+         << GetOcc(MinGPRs) << ") " << (RangeValid ? ' ' : 'R');
+
+  OS << std::left << std::setw(15) << MinStr.str() << std::setw(3) << MaxGPRs
+     << " (O" << GetOcc(MaxGPRs) << ')' << (MaxValid ? "" : " >");
+
+  return MinValid && MaxValid && RangeValid;
+}
+
+/// \returns true if \p CanonicCPUName starts with "gfx10" or "gfx11". Only
+/// such CPUs are run by testGPRLimits with both wavefront size features.
+bool hasW32(StringRef CanonicCPUName) {
+  return CanonicCPUName.starts_with("gfx10") ||
+         CanonicCPUName.starts_with("gfx11");
+}
+
+// Pairs of a feature string and the suffix appended to the reported CPU name.
+static std::vector<std::pair<StringRef, StringRef>>
+    EmptySet = {{"", ""}},
+    W32W64 = {{"+wavefrontsize32", "w32"}, {"+wavefrontsize64", "w64"}};
+
+/// Runs \p test for every occupancy, from the maximum down to 1, of every
+/// valid AMDGCN CPU and collects the rows it writes into one table per
+/// subtarget. If \p TestW32W64 is set, CPUs accepted by hasW32 are tested once
+/// per wavefront size feature. Tables of failing subtargets (or of all
+/// subtargets when print-cpu-reg-limits is set) are reported through a gtest
+/// failure; subtargets with identical tables are listed together. \p RegName
+/// only labels the table columns.
+void testGPRLimits(
+    const char *RegName, bool TestW32W64,
+    std::function<bool(std::stringstream &, unsigned, GCNSubtarget &)> test) {
+  SmallVector<StringRef> CPUs;
+  AMDGPU::fillValidArchListAMDGCN(CPUs);
+
+  std::map<std::string, SmallVector<std::string>> TablePerCPUs;
+  for (auto CPUName : CPUs) {
+    auto CanonCPUName =
+        AMDGPU::getArchNameAMDGCN(AMDGPU::parseArchAMDGCN(CPUName));
+
+    auto *AttrSet = (TestW32W64 && hasW32(CanonCPUName)) ? &W32W64 : &EmptySet;
+    for (auto &P : *AttrSet) {
+      auto TM = createTargetMachine("amdgcn-amd-", CPUName, P.first);
+      if (!TM)
+        continue;
+      GCNSubtarget ST(TM->getTargetTriple(), std::string(TM->getTargetCPU()),
+                      std::string(TM->getTargetFeatureString()), *TM);
+      std::stringstream Table;
+      bool Success = true;
+      unsigned MaxOcc = ST.getMaxWavesPerEU();
+      for (unsigned Occ = MaxOcc; Occ > 0; --Occ) {
+        Table << std::right << std::setw(3) << Occ << " ";
+        Success = test(Table, Occ, ST) && Success;
+        Table << '\n';
+      }
+      if (!Success || PrintCpuRegLimits)
+        TablePerCPUs[Table.str()].push_back((CanonCPUName + P.second).str());
+    }
+  }
+  std::stringstream OS;
+  for (auto &P : TablePerCPUs) {
+    for (auto &CPUName : P.second)
+      OS << ' ' << CPUName;
+    OS << ":\nOcc Min" << RegName << " Max" << RegName << '\n'
+       << P.first << '\n';
+  }
+  auto ErrStr = OS.str();
+  EXPECT_TRUE(ErrStr.empty()) << ErrStr;
+}
+
+// The VGPR limits must agree with getOccupancyWithNumVGPRs for occupancies
+// above the one reached with all addressable VGPRs, up to the maximum.
+TEST(AMDGPU, TestVGPRLimitsPerOccupancy) {
+  testGPRLimits("VGPR", true, [](std::stringstream &OS, unsigned Occ,
+                                 GCNSubtarget &ST) {
+    unsigned MaxVGPRNum = ST.getAddressableNumVGPRs();
+    return checkMinMax(
+        OS, Occ, ST.getOccupancyWithNumVGPRs(MaxVGPRNum), ST.getMaxWavesPerEU(),
+        [&](unsigned NumGPRs) { return ST.getOccupancyWithNumVGPRs(NumGPRs); },
+        [&](unsigned Occ) { return ST.getMinNumVGPRs(Occ); },
+        [&](unsigned Occ) { return ST.getMaxNumVGPRs(Occ); });
+  });
+}
diff --git a/llvm/unittests/Target/AMDGPU/CMakeLists.txt b/llvm/unittests/Target/AMDGPU/CMakeLists.txt
--- a/llvm/unittests/Target/AMDGPU/CMakeLists.txt
+++ b/llvm/unittests/Target/AMDGPU/CMakeLists.txt
@@ -14,6 +14,7 @@
   )
 
 add_llvm_target_unittest(AMDGPUTests
+  AMDGPUUnitTests.cpp
   DwarfRegMappings.cpp
   ExecMayBeModifiedBeforeAnyUse.cpp
   )