diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp --- a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp @@ -625,13 +625,8 @@ return 5; } -unsigned GCNSubtarget::getOccupancyWithNumVGPRs(unsigned VGPRs) const { - unsigned MaxWaves = getMaxWavesPerEU(); - unsigned Granule = getVGPRAllocGranule(); - if (VGPRs < Granule) - return MaxWaves; - unsigned RoundedRegs = ((VGPRs + Granule - 1) / Granule) * Granule; - return std::min(std::max(getTotalNumVGPRs() / RoundedRegs, 1u), MaxWaves); +unsigned GCNSubtarget::getOccupancyWithNumVGPRs(unsigned NumVGPRs) const { + return AMDGPU::IsaInfo::getNumWavesPerEUWithNumVGPRs(this, NumVGPRs); } unsigned diff --git a/llvm/lib/Target/AMDGPU/GCNSubtarget.h b/llvm/lib/Target/AMDGPU/GCNSubtarget.h --- a/llvm/lib/Target/AMDGPU/GCNSubtarget.h +++ b/llvm/lib/Target/AMDGPU/GCNSubtarget.h @@ -1216,14 +1216,14 @@ return AMDGPU::IsaInfo::getAddressableNumVGPRs(this); } - /// \returns Minimum number of VGPRs that meets given number of waves per - /// execution unit requirement supported by the subtarget. + /// \returns the minimum number of VGPRs that will prevent achieving more than + /// the specified number of waves \p WavesPerEU. unsigned getMinNumVGPRs(unsigned WavesPerEU) const { return AMDGPU::IsaInfo::getMinNumVGPRs(this, WavesPerEU); } - /// \returns Maximum number of VGPRs that meets given number of waves per - /// execution unit requirement supported by the subtarget. + /// \returns the maximum number of VGPRs that can be used and still achieve + /// at least the specified number of waves \p WavesPerEU. 
unsigned getMaxNumVGPRs(unsigned WavesPerEU) const { return AMDGPU::IsaInfo::getMaxNumVGPRs(this, WavesPerEU); } diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h --- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h +++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h @@ -286,6 +286,11 @@ /// execution unit requirement for given subtarget \p STI. unsigned getMaxNumVGPRs(const MCSubtargetInfo *STI, unsigned WavesPerEU); +/// \returns Number of waves reachable for a given \p NumVGPRs usage for given +/// subtarget \p STI. +unsigned getNumWavesPerEUWithNumVGPRs(const MCSubtargetInfo *STI, + unsigned NumVGPRs); + /// \returns Number of VGPR blocks needed for given subtarget \p STI when /// \p NumVGPRs are used. /// diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp --- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp +++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp @@ -1011,15 +1011,39 @@ return 256; } +unsigned getNumWavesPerEUWithNumVGPRs(const MCSubtargetInfo *STI, + unsigned NumVGPRs) { + unsigned MaxWaves = getMaxWavesPerEU(STI); + unsigned Granule = getVGPRAllocGranule(STI); + if (NumVGPRs < Granule) + return MaxWaves; + unsigned RoundedRegs = alignTo(NumVGPRs, Granule); + return std::min(std::max(getTotalNumVGPRs(STI) / RoundedRegs, 1u), MaxWaves); +} + unsigned getMinNumVGPRs(const MCSubtargetInfo *STI, unsigned WavesPerEU) { assert(WavesPerEU != 0); - if (WavesPerEU >= getMaxWavesPerEU(STI)) + unsigned MaxNumWaves = getMaxWavesPerEU(STI); + if (WavesPerEU >= MaxNumWaves) return 0; - unsigned MinNumVGPRs = - alignDown(getTotalNumVGPRs(STI) / (WavesPerEU + 1), - getVGPRAllocGranule(STI)) + 1; - return std::min(MinNumVGPRs, getAddressableNumVGPRs(STI)); + + unsigned NumVGPRs = getTotalNumVGPRs(STI); + unsigned Granule = getVGPRAllocGranule(STI); + unsigned MaxNumVGPRs = alignDown(NumVGPRs / WavesPerEU, Granule); + + unsigned NumWaves = 
getNumWavesPerEUWithNumVGPRs(STI, MaxNumVGPRs); + if (NumWaves >= MaxNumWaves) + return 0; + + unsigned AddressableNumVGPRs = getAddressableNumVGPRs(STI); + unsigned MinNumWaves = getNumWavesPerEUWithNumVGPRs(STI, AddressableNumVGPRs); + if (NumWaves < MinNumWaves) + return getMinNumVGPRs(STI, MinNumWaves); + + unsigned MaxNumVGPRsNext = alignDown(NumVGPRs / (WavesPerEU + 1), Granule); + unsigned MinNumVGPRs = 1 + std::min(MaxNumVGPRs - Granule, MaxNumVGPRsNext); + return std::min(MinNumVGPRs, AddressableNumVGPRs); } unsigned getMaxNumVGPRs(const MCSubtargetInfo *STI, unsigned WavesPerEU) { diff --git a/llvm/test/CodeGen/AMDGPU/occupancy-levels.ll b/llvm/test/CodeGen/AMDGPU/occupancy-levels.ll --- a/llvm/test/CodeGen/AMDGPU/occupancy-levels.ll +++ b/llvm/test/CodeGen/AMDGPU/occupancy-levels.ll @@ -40,7 +40,7 @@ ; GCN-LABEL: {{^}}limited_occupancy_19: ; GFX9: ; Occupancy: 10 -; GFX1010: ; Occupancy: 18 +; GFX1010: ; Occupancy: 20 ; GFX1030: ; Occupancy: 16 ; GFX1100: ; Occupancy: 16 define amdgpu_kernel void @limited_occupancy_19() #2 { diff --git a/llvm/unittests/Target/AMDGPU/AMDGPUUnitTests.h b/llvm/unittests/Target/AMDGPU/AMDGPUUnitTests.h new file mode 100644 --- /dev/null +++ b/llvm/unittests/Target/AMDGPU/AMDGPUUnitTests.h @@ -0,0 +1,25 @@ +//===---------- llvm/unittests/Target/AMDGPU/AMDGPUUnitTests.h ------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_UNITTESTS_TARGET_AMDGPU_AMDGPUUNITTESTS_H +#define LLVM_UNITTESTS_TARGET_AMDGPU_AMDGPUUNITTESTS_H + +#include +#include + +namespace llvm { + +class GCNTargetMachine; +class StringRef; + +std::unique_ptr +createAMDGPUTargetMachine(std::string TStr, StringRef CPU, StringRef FS); + +} // end namespace llvm + +#endif // LLVM_UNITTESTS_TARGET_AMDGPU_AMDGPUUNITTESTS_H diff --git a/llvm/unittests/Target/AMDGPU/AMDGPUUnitTests.cpp b/llvm/unittests/Target/AMDGPU/AMDGPUUnitTests.cpp new file mode 100644 --- /dev/null +++ b/llvm/unittests/Target/AMDGPU/AMDGPUUnitTests.cpp @@ -0,0 +1,156 @@ +//===--------- llvm/unittests/Target/AMDGPU/AMDGPUUnitTests.cpp -----------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "AMDGPUUnitTests.h" +#include "AMDGPUTargetMachine.h" +#include "GCNSubtarget.h" +#include "llvm/MC/TargetRegistry.h" +#include "llvm/Support/TargetParser.h" +#include "llvm/Support/TargetSelect.h" +#include "gtest/gtest.h" + +#include "AMDGPUGenSubtargetInfo.inc" + +using namespace llvm; + +std::once_flag flag; + +void InitializeAMDGPUTarget() { + std::call_once(flag, []() { + LLVMInitializeAMDGPUTargetInfo(); + LLVMInitializeAMDGPUTarget(); + LLVMInitializeAMDGPUTargetMC(); + }); +} + +std::unique_ptr +llvm::createAMDGPUTargetMachine(std::string TStr, StringRef CPU, StringRef FS) { + InitializeAMDGPUTarget(); + + std::string Error; + const Target *T = TargetRegistry::lookupTarget(TStr, Error); + if (!T) + return nullptr; + + TargetOptions Options; + return std::unique_ptr(static_cast( + T->createTargetMachine(TStr, CPU, FS, Options, None, None))); 
+} + +static cl::opt PrintCpuRegLimits( + "print-cpu-reg-limits", cl::NotHidden, cl::init(false), + cl::desc("force printing per AMDGPU CPU register limits")); + +static bool checkMinMax(std::stringstream &OS, unsigned Occ, unsigned MinOcc, + unsigned MaxOcc, + std::function GetOcc, + std::function GetMinGPRs, + std::function GetMaxGPRs) { + bool MinValid = true, MaxValid = true, RangeValid = true; + unsigned MinGPRs = GetMinGPRs(Occ); + unsigned MaxGPRs = GetMaxGPRs(Occ); + unsigned RealOcc; + + if (MinGPRs >= MaxGPRs) + RangeValid = false; + else { + RealOcc = GetOcc(MinGPRs); + for (unsigned NumRegs = MinGPRs + 1; NumRegs <= MaxGPRs; ++NumRegs) { + if (RealOcc != GetOcc(NumRegs)) { + RangeValid = false; + break; + } + } + } + + if (RangeValid && RealOcc > MinOcc && RealOcc <= MaxOcc) { + if (MinGPRs > 0 && GetOcc(MinGPRs - 1) <= RealOcc) + MinValid = false; + + if (GetOcc(MaxGPRs + 1) >= RealOcc) + MaxValid = false; + } + + std::stringstream MinStr; + MinStr << (MinValid ? ' ' : '<') << ' ' << std::setw(3) << MinGPRs << " (O" + << GetOcc(MinGPRs) << ") " << (RangeValid ? ' ' : 'R'); + + OS << std::left << std::setw(15) << MinStr.str() << std::setw(3) << MaxGPRs + << " (O" << GetOcc(MaxGPRs) << ')' << (MaxValid ? 
"" : " >"); + + return MinValid && MaxValid && RangeValid; +} + +static const std::pair + EmptyFS = {"", ""}, + W32FS = {"+wavefrontsize32", "w32"}, + W64FS = {"+wavefrontsize64", "w64"}; + +static void testGPRLimits( + const char *RegName, bool TestW32W64, + std::function test) { + SmallVector CPUs; + AMDGPU::fillValidArchListAMDGCN(CPUs); + + std::map> TablePerCPUs; + for (auto CPUName : CPUs) { + auto CanonCPUName = + AMDGPU::getArchNameAMDGCN(AMDGPU::parseArchAMDGCN(CPUName)); + + auto *FS = &EmptyFS; + while (true) { + auto TM = createAMDGPUTargetMachine("amdgcn-amd-", CPUName, FS->first); + if (!TM) + break; + + GCNSubtarget ST(TM->getTargetTriple(), std::string(TM->getTargetCPU()), + std::string(TM->getTargetFeatureString()), *TM); + + if (TestW32W64 && + ST.getFeatureBits().test(AMDGPU::FeatureWavefrontSize32)) + FS = &W32FS; + + std::stringstream Table; + bool Success = true; + unsigned MaxOcc = ST.getMaxWavesPerEU(); + for (unsigned Occ = MaxOcc; Occ > 0; --Occ) { + Table << std::right << std::setw(3) << Occ << " "; + Success = test(Table, Occ, ST) && Success; + Table << '\n'; + } + if (!Success || PrintCpuRegLimits) + TablePerCPUs[Table.str()].push_back((CanonCPUName + FS->second).str()); + + if (FS != &W32FS) + break; + + FS = &W64FS; + } + } + std::stringstream OS; + for (auto &P : TablePerCPUs) { + for (auto &CPUName : P.second) + OS << ' ' << CPUName; + OS << ":\nOcc Min" << RegName << " Max" << RegName << '\n' + << P.first << '\n'; + } + auto ErrStr = OS.str(); + EXPECT_TRUE(ErrStr.empty()) << ErrStr; +} + +TEST(AMDGPU, TestVGPRLimitsPerOccupancy) { + testGPRLimits("VGPR", true, [](std::stringstream &OS, unsigned Occ, + GCNSubtarget &ST) { + unsigned MaxVGPRNum = ST.getAddressableNumVGPRs(); + return checkMinMax( + OS, Occ, ST.getOccupancyWithNumVGPRs(MaxVGPRNum), ST.getMaxWavesPerEU(), + [&](unsigned NumGPRs) { return ST.getOccupancyWithNumVGPRs(NumGPRs); }, + [&](unsigned Occ) { return ST.getMinNumVGPRs(Occ); }, + [&](unsigned Occ) { return 
ST.getMaxNumVGPRs(Occ); }); + }); +} diff --git a/llvm/unittests/Target/AMDGPU/CMakeLists.txt b/llvm/unittests/Target/AMDGPU/CMakeLists.txt --- a/llvm/unittests/Target/AMDGPU/CMakeLists.txt +++ b/llvm/unittests/Target/AMDGPU/CMakeLists.txt @@ -14,6 +14,7 @@ ) add_llvm_target_unittest(AMDGPUTests + AMDGPUUnitTests.cpp DwarfRegMappings.cpp ExecMayBeModifiedBeforeAnyUse.cpp ) diff --git a/llvm/unittests/Target/AMDGPU/DwarfRegMappings.cpp b/llvm/unittests/Target/AMDGPU/DwarfRegMappings.cpp --- a/llvm/unittests/Target/AMDGPU/DwarfRegMappings.cpp +++ b/llvm/unittests/Target/AMDGPU/DwarfRegMappings.cpp @@ -6,46 +6,16 @@ // //===----------------------------------------------------------------------===// -#include "AMDGPUSubtarget.h" #include "AMDGPUTargetMachine.h" -#include "llvm/CodeGen/TargetSubtargetInfo.h" -#include "llvm/MC/MCTargetOptions.h" -#include "llvm/MC/TargetRegistry.h" -#include "llvm/Support/TargetSelect.h" -#include "llvm/Target/TargetMachine.h" +#include "AMDGPUUnitTests.h" #include "gtest/gtest.h" -#include using namespace llvm; -std::once_flag flag; - -void InitializeAMDGPUTarget() { - std::call_once(flag, []() { - LLVMInitializeAMDGPUTargetInfo(); - LLVMInitializeAMDGPUTarget(); - LLVMInitializeAMDGPUTargetMC(); - }); -} - -std::unique_ptr -createTargetMachine(std::string TStr, StringRef CPU, StringRef FS) { - InitializeAMDGPUTarget(); - - std::string Error; - const Target *T = TargetRegistry::lookupTarget(TStr, Error); - if (!T) - return nullptr; - - TargetOptions Options; - return std::unique_ptr(static_cast( - T->createTargetMachine(TStr, CPU, FS, Options, None, None))); -} - -TEST(AMDGPUDwarfRegMappingTests, TestWave64DwarfRegMapping) { +TEST(AMDGPU, TestWave64DwarfRegMapping) { for (auto Triple : {"amdgcn-amd-", "amdgcn-amd-amdhsa", "amdgcn-amd-amdpal"}) { - auto TM = createTargetMachine(Triple, "gfx1010", "+wavefrontsize64"); + auto TM = createAMDGPUTargetMachine(Triple, "gfx1010", "+wavefrontsize64"); if (TM) { GCNSubtarget 
ST(TM->getTargetTriple(), std::string(TM->getTargetCPU()), std::string(TM->getTargetFeatureString()), *TM); @@ -65,10 +35,10 @@ } } -TEST(AMDGPUDwarfRegMappingTests, TestWave32DwarfRegMapping) { +TEST(AMDGPU, TestWave32DwarfRegMapping) { for (auto Triple : {"amdgcn-amd-", "amdgcn-amd-amdhsa", "amdgcn-amd-amdpal"}) { - auto TM = createTargetMachine(Triple, "gfx1010", "+wavefrontsize32"); + auto TM = createAMDGPUTargetMachine(Triple, "gfx1010", "+wavefrontsize32"); if (TM) { GCNSubtarget ST(TM->getTargetTriple(), std::string(TM->getTargetCPU()), std::string(TM->getTargetFeatureString()), *TM); diff --git a/llvm/unittests/Target/AMDGPU/ExecMayBeModifiedBeforeAnyUse.cpp b/llvm/unittests/Target/AMDGPU/ExecMayBeModifiedBeforeAnyUse.cpp --- a/llvm/unittests/Target/AMDGPU/ExecMayBeModifiedBeforeAnyUse.cpp +++ b/llvm/unittests/Target/AMDGPU/ExecMayBeModifiedBeforeAnyUse.cpp @@ -7,25 +7,14 @@ //===----------------------------------------------------------------------===// #include "AMDGPUTargetMachine.h" -#include "GCNSubtarget.h" -#include "MCTargetDesc/AMDGPUMCTargetDesc.h" +#include "AMDGPUUnitTests.h" #include "llvm/CodeGen/MachineModuleInfo.h" -#include "llvm/CodeGen/TargetSubtargetInfo.h" -#include "llvm/MC/MCTargetOptions.h" -#include "llvm/MC/TargetRegistry.h" -#include "llvm/Support/TargetSelect.h" -#include "llvm/Target/TargetMachine.h" #include "gtest/gtest.h" -#include using namespace llvm; -// implementation is in the llvm/unittests/Target/AMDGPU/DwarfRegMappings.cpp -std::unique_ptr -createTargetMachine(std::string TStr, StringRef CPU, StringRef FS); - -TEST(AMDGPUExecMayBeModifiedBeforeAnyUse, TheTest) { - auto TM = createTargetMachine("amdgcn-amd-", "gfx906", ""); +TEST(AMDGPU, ExecMayBeModifiedBeforeAnyUse) { + auto TM = createAMDGPUTargetMachine("amdgcn-amd-", "gfx906", ""); if (!TM) return;