Index: lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp =================================================================== --- lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp +++ lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp @@ -109,12 +109,13 @@ TS->EmitDirectiveHSACodeObjectVersion(2, 1); const MCSubtargetInfo *STI = TM.getMCSubtargetInfo(); - AMDGPU::IsaVersion ISA = AMDGPU::getIsaVersion(STI->getFeatureBits()); + AMDGPU::IsaInfo::IsaVersion ISA = + AMDGPU::IsaInfo::getIsaVersion(STI->getFeatureBits()); TS->EmitDirectiveHSACodeObjectISA(ISA.Major, ISA.Minor, ISA.Stepping, "AMD", "AMDGPU"); // Emit runtime metadata. - TS->EmitRuntimeMetadata(M); + TS->EmitRuntimeMetadata(STI->getFeatureBits(), M); } bool AMDGPUAsmPrinter::isBlockOnlyReachableByFallthrough( @@ -458,6 +459,7 @@ } unsigned ExtraSGPRs = 0; + unsigned ExtraVGPRs = 0; if (VCCUsed) ExtraSGPRs = 2; @@ -473,11 +475,10 @@ ExtraSGPRs = 6; } - // Record first reserved register and reserved register count fields, and - // update max register counts if "amdgpu-debugger-reserve-regs" attribute was - // requested. - ProgInfo.ReservedVGPRFirst = STM.debuggerReserveRegs() ? MaxVGPR + 1 : 0; - ProgInfo.ReservedVGPRCount = STM.getReservedNumVGPRs(MF); + // Record first reserved VGPR and number of reserved VGPRs. + ExtraVGPRs = STM.getReservedNumVGPRs(MF); + ProgInfo.ReservedVGPRFirst = ExtraVGPRs > 0 ? MaxVGPR + 1 : 0; + ProgInfo.ReservedVGPRCount = ExtraVGPRs; // Update DebuggerWavefrontPrivateSegmentOffsetSGPR and // DebuggerPrivateSegmentBufferSGPR fields if "amdgpu-debugger-emit-prologue" @@ -499,7 +500,8 @@ DiagnosticInfoResourceLimit Diag(*MF.getFunction(), "addressable scalar registers", MaxSGPR + 1, DS_Error, - DK_ResourceLimit, MaxAddressableNumSGPRs); + DK_ResourceLimit, + MaxAddressableNumSGPRs); Ctx.diagnose(Diag); MaxSGPR = MaxAddressableNumSGPRs - 1; } @@ -507,7 +509,7 @@ // Account for extra SGPRs and VGPRs reserved for debugger use. 
MaxSGPR += ExtraSGPRs; - MaxVGPR += STM.getReservedNumVGPRs(MF); + MaxVGPR += ExtraVGPRs; // We found the maximum register index. They start at 0, so add one to get the // number of registers. @@ -523,25 +525,27 @@ if (STM.getGeneration() <= AMDGPUSubtarget::SEA_ISLANDS || STM.hasSGPRInitBug()) { - unsigned MaxNumSGPRs = STM.getAddressableNumSGPRs(); - if (ProgInfo.NumSGPR > MaxNumSGPRs) { + unsigned MaxAddressableNumSGPRs = STM.getAddressableNumSGPRs(); + if (ProgInfo.NumSGPR > MaxAddressableNumSGPRs) { // This can happen due to a compiler bug or when using inline asm to use the // registers which are usually reserved for vcc etc. - LLVMContext &Ctx = MF.getFunction()->getContext(); DiagnosticInfoResourceLimit Diag(*MF.getFunction(), "scalar registers", ProgInfo.NumSGPR, DS_Error, - DK_ResourceLimit, MaxNumSGPRs); + DK_ResourceLimit, + MaxAddressableNumSGPRs); Ctx.diagnose(Diag); - ProgInfo.NumSGPR = MaxNumSGPRs; - ProgInfo.NumSGPRsForWavesPerEU = MaxNumSGPRs; + ProgInfo.NumSGPR = MaxAddressableNumSGPRs; + ProgInfo.NumSGPRsForWavesPerEU = MaxAddressableNumSGPRs; } } if (STM.hasSGPRInitBug()) { - ProgInfo.NumSGPR = SISubtarget::FIXED_SGPR_COUNT_FOR_INIT_BUG; - ProgInfo.NumSGPRsForWavesPerEU = SISubtarget::FIXED_SGPR_COUNT_FOR_INIT_BUG; + ProgInfo.NumSGPR = + AMDGPU::IsaInfo::FIXED_NUM_SGPRS_FOR_INIT_BUG; + ProgInfo.NumSGPRsForWavesPerEU = + AMDGPU::IsaInfo::FIXED_NUM_SGPRS_FOR_INIT_BUG; } if (MFI->NumUserSGPRs > STM.getMaxNumUserSGPRs()) { Index: lib/Target/AMDGPU/AMDGPURuntimeMetadata.h =================================================================== --- lib/Target/AMDGPU/AMDGPURuntimeMetadata.h +++ lib/Target/AMDGPU/AMDGPURuntimeMetadata.h @@ -41,37 +41,90 @@ // Version and revision of runtime metadata const unsigned char MDVersion = 2; - const unsigned char MDRevision = 0; + const unsigned char MDRevision = 1; // Name of keys for runtime metadata. 
namespace KeyName { - const char MDVersion[] = "amd.MDVersion"; // Runtime metadata version - const char Language[] = "amd.Language"; // Language - const char LanguageVersion[] = "amd.LanguageVersion"; // Language version - const char Kernels[] = "amd.Kernels"; // Kernels - const char KernelName[] = "amd.KernelName"; // Kernel name - const char Args[] = "amd.Args"; // Kernel arguments - const char ArgSize[] = "amd.ArgSize"; // Kernel arg size - const char ArgAlign[] = "amd.ArgAlign"; // Kernel arg alignment - const char ArgTypeName[] = "amd.ArgTypeName"; // Kernel type name - const char ArgName[] = "amd.ArgName"; // Kernel name - const char ArgKind[] = "amd.ArgKind"; // Kernel argument kind - const char ArgValueType[] = "amd.ArgValueType"; // Kernel argument value type - const char ArgAddrQual[] = "amd.ArgAddrQual"; // Kernel argument address qualifier - const char ArgAccQual[] = "amd.ArgAccQual"; // Kernel argument access qualifier - const char ArgIsConst[] = "amd.ArgIsConst"; // Kernel argument is const qualified - const char ArgIsRestrict[] = "amd.ArgIsRestrict"; // Kernel argument is restrict qualified - const char ArgIsVolatile[] = "amd.ArgIsVolatile"; // Kernel argument is volatile qualified - const char ArgIsPipe[] = "amd.ArgIsPipe"; // Kernel argument is pipe qualified - const char ReqdWorkGroupSize[] = "amd.ReqdWorkGroupSize"; // Required work group size - const char WorkGroupSizeHint[] = "amd.WorkGroupSizeHint"; // Work group size hint - const char VecTypeHint[] = "amd.VecTypeHint"; // Vector type hint - const char KernelIndex[] = "amd.KernelIndex"; // Kernel index for device enqueue - const char NoPartialWorkGroups[] = "amd.NoPartialWorkGroups"; // No partial work groups - const char PrintfInfo[] = "amd.PrintfInfo"; // Prinf function call information - const char ArgActualAcc[] = "amd.ArgActualAcc"; // The actual kernel argument access qualifier - const char ArgPointeeAlign[] = "amd.ArgPointeeAlign"; // Alignment of pointee type + // Runtime metadata 
version + const char MDVersion[] = "amd.MDVersion"; + + // Instruction set architecture information + const char IsaInfo[] = "amd.IsaInfo"; + // Wavefront size + const char IsaInfoWavefrontSize[] = "amd.IsaInfoWavefrontSize"; + // Local memory size in bytes + const char IsaInfoLocalMemorySize[] = "amd.IsaInfoLocalMemorySize"; + // Number of execution units per compute unit + const char IsaInfoEUsPerCU[] = "amd.IsaInfoEUsPerCU"; + // Maximum number of waves per execution unit + const char IsaInfoMaxWavesPerEU[] = "amd.IsaInfoMaxWavesPerEU"; + // Maximum flat work group size + const char IsaInfoMaxFlatWorkGroupSize[] = "amd.IsaInfoMaxFlatWorkGroupSize"; + // SGPR allocation granularity + const char IsaInfoSGPRAllocGranule[] = "amd.IsaInfoSGPRAllocGranule"; + // Total number of SGPRs + const char IsaInfoTotalNumSGPRs[] = "amd.IsaInfoTotalNumSGPRs"; + // Addressable number of SGPRs + const char IsaInfoAddressableNumSGPRs[] = "amd.IsaInfoAddressableNumSGPRs"; + // VGPR allocation granularity + const char IsaInfoVGPRAllocGranule[] = "amd.IsaInfoVGPRAllocGranule"; + // Total number of VGPRs + const char IsaInfoTotalNumVGPRs[] = "amd.IsaInfoTotalNumVGPRs"; + // Addressable number of VGPRs + const char IsaInfoAddressableNumVGPRs[] = "amd.IsaInfoAddressableNumVGPRs"; + + // Language + const char Language[] = "amd.Language"; + // Language version + const char LanguageVersion[] = "amd.LanguageVersion"; + + // Kernels + const char Kernels[] = "amd.Kernels"; + // Kernel name + const char KernelName[] = "amd.KernelName"; + // Kernel arguments + const char Args[] = "amd.Args"; + // Kernel argument size in bytes + const char ArgSize[] = "amd.ArgSize"; + // Kernel argument alignment + const char ArgAlign[] = "amd.ArgAlign"; + // Kernel argument type name + const char ArgTypeName[] = "amd.ArgTypeName"; + // Kernel argument name + const char ArgName[] = "amd.ArgName"; + // Kernel argument kind + const char ArgKind[] = "amd.ArgKind"; + // Kernel argument value type + const char 
ArgValueType[] = "amd.ArgValueType"; + // Kernel argument address qualifier + const char ArgAddrQual[] = "amd.ArgAddrQual"; + // Kernel argument access qualifier + const char ArgAccQual[] = "amd.ArgAccQual"; + // Kernel argument is const qualified + const char ArgIsConst[] = "amd.ArgIsConst"; + // Kernel argument is restrict qualified + const char ArgIsRestrict[] = "amd.ArgIsRestrict"; + // Kernel argument is volatile qualified + const char ArgIsVolatile[] = "amd.ArgIsVolatile"; + // Kernel argument is pipe qualified + const char ArgIsPipe[] = "amd.ArgIsPipe"; + // Required work group size + const char ReqdWorkGroupSize[] = "amd.ReqdWorkGroupSize"; + // Work group size hint + const char WorkGroupSizeHint[] = "amd.WorkGroupSizeHint"; + // Vector type hint + const char VecTypeHint[] = "amd.VecTypeHint"; + // Kernel index for device enqueue + const char KernelIndex[] = "amd.KernelIndex"; + // No partial work groups + const char NoPartialWorkGroups[] = "amd.NoPartialWorkGroups"; + // Printf function call information + const char PrintfInfo[] = "amd.PrintfInfo"; + // The actual kernel argument access qualifier + const char ArgActualAcc[] = "amd.ArgActualAcc"; + // Alignment of pointee type + const char ArgPointeeAlign[] = "amd.ArgPointeeAlign"; } // end namespace KeyName @@ -175,11 +228,45 @@ } // end namespace Kernel + namespace IsaInfo { + + /// \brief In-memory representation of instruction set architecture + /// information. + struct Metadata { + /// \brief Wavefront size. + unsigned WavefrontSize = 0; + /// \brief Local memory size in bytes. + unsigned LocalMemorySize = 0; + /// \brief Number of execution units per compute unit. + unsigned EUsPerCU = 0; + /// \brief Maximum number of waves per execution unit. + unsigned MaxWavesPerEU = 0; + /// \brief Maximum flat work group size. + unsigned MaxFlatWorkGroupSize = 0; + /// \brief SGPR allocation granularity. + unsigned SGPRAllocGranule = 0; + /// \brief Total number of SGPRs. 
+ unsigned TotalNumSGPRs = 0; + /// \brief Addressable number of SGPRs. + unsigned AddressableNumSGPRs = 0; + /// \brief VGPR allocation granularity. + unsigned VGPRAllocGranule = 0; + /// \brief Total number of VGPRs. + unsigned TotalNumVGPRs = 0; + /// \brief Addressable number of VGPRs. + unsigned AddressableNumVGPRs = 0; + + Metadata() = default; + }; + + } // end namespace IsaInfo + namespace Program { // In-memory representation of program information. struct Metadata { std::vector MDVersionSeq; + IsaInfo::Metadata IsaInfo; std::vector PrintfInfo; std::vector Kernels; Index: lib/Target/AMDGPU/AMDGPUSubtarget.h =================================================================== --- lib/Target/AMDGPU/AMDGPUSubtarget.h +++ lib/Target/AMDGPU/AMDGPUSubtarget.h @@ -365,72 +365,71 @@ return true; } + void setScalarizeGlobalBehavior(bool b) { ScalarizeGlobal = b;} + bool getScalarizeGlobalBehavior() const { return ScalarizeGlobal;} + /// \returns Number of execution units per compute unit supported by the /// subtarget. unsigned getEUsPerCU() const { - return 4; + return AMDGPU::IsaInfo::getEUsPerCU(getFeatureBits()); } /// \returns Maximum number of work groups per compute unit supported by the - /// subtarget and limited by given flat work group size. + /// subtarget and limited by given \p FlatWorkGroupSize. unsigned getMaxWorkGroupsPerCU(unsigned FlatWorkGroupSize) const { - if (getGeneration() < AMDGPUSubtarget::SOUTHERN_ISLANDS) - return 8; - return getWavesPerWorkGroup(FlatWorkGroupSize) == 1 ? 40 : 16; + return AMDGPU::IsaInfo::getMaxWorkGroupsPerCU(getFeatureBits(), + FlatWorkGroupSize); } /// \returns Maximum number of waves per compute unit supported by the /// subtarget without any kind of limitation. 
unsigned getMaxWavesPerCU() const { - return getMaxWavesPerEU() * getEUsPerCU(); + return AMDGPU::IsaInfo::getMaxWavesPerCU(getFeatureBits()); } /// \returns Maximum number of waves per compute unit supported by the - /// subtarget and limited by given flat work group size. + /// subtarget and limited by given \p FlatWorkGroupSize. unsigned getMaxWavesPerCU(unsigned FlatWorkGroupSize) const { - return getWavesPerWorkGroup(FlatWorkGroupSize); + return AMDGPU::IsaInfo::getMaxWavesPerCU(getFeatureBits(), + FlatWorkGroupSize); } /// \returns Minimum number of waves per execution unit supported by the /// subtarget. unsigned getMinWavesPerEU() const { - return 1; + return AMDGPU::IsaInfo::getMinWavesPerEU(getFeatureBits()); } /// \returns Maximum number of waves per execution unit supported by the /// subtarget without any kind of limitation. unsigned getMaxWavesPerEU() const { - if (getGeneration() < AMDGPUSubtarget::SOUTHERN_ISLANDS) - return 8; - // FIXME: Need to take scratch memory into account. - return 10; + return AMDGPU::IsaInfo::getMaxWavesPerEU(getFeatureBits()); } /// \returns Maximum number of waves per execution unit supported by the - /// subtarget and limited by given flat work group size. + /// subtarget and limited by given \p FlatWorkGroupSize. unsigned getMaxWavesPerEU(unsigned FlatWorkGroupSize) const { - return alignTo(getMaxWavesPerCU(FlatWorkGroupSize), getEUsPerCU()) / - getEUsPerCU(); + return AMDGPU::IsaInfo::getMaxWavesPerEU(getFeatureBits(), + FlatWorkGroupSize); } /// \returns Minimum flat work group size supported by the subtarget. unsigned getMinFlatWorkGroupSize() const { - return 1; + return AMDGPU::IsaInfo::getMinFlatWorkGroupSize(getFeatureBits()); } /// \returns Maximum flat work group size supported by the subtarget. unsigned getMaxFlatWorkGroupSize() const { - return 2048; + return AMDGPU::IsaInfo::getMaxFlatWorkGroupSize(getFeatureBits()); } - /// \returns Number of waves per work group given the flat work group size. 
+ /// \returns Number of waves per work group supported by the subtarget and + /// limited by given \p FlatWorkGroupSize. unsigned getWavesPerWorkGroup(unsigned FlatWorkGroupSize) const { - return alignTo(FlatWorkGroupSize, getWavefrontSize()) / getWavefrontSize(); + return AMDGPU::IsaInfo::getWavesPerWorkGroup(getFeatureBits(), + FlatWorkGroupSize); } - void setScalarizeGlobalBehavior(bool b) { ScalarizeGlobal = b;} - bool getScalarizeGlobalBehavior() const { return ScalarizeGlobal;} - /// \returns Subtarget's default pair of minimum/maximum flat work group sizes /// for function \p F, or minimum/maximum flat work group sizes explicitly /// requested using "amdgpu-flat-work-group-size" attribute attached to @@ -492,13 +491,6 @@ }; class SISubtarget final : public AMDGPUSubtarget { -public: - enum { - // The closed Vulkan driver sets 96, which limits the wave count to 8 but - // doesn't spill SGPRs as much as when 80 is set. - FIXED_SGPR_COUNT_FOR_INIT_BUG = 96 - }; - private: SIInstrInfo InstrInfo; SIFrameLowering FrameLowering; @@ -644,39 +636,36 @@ /// \returns SGPR allocation granularity supported by the subtarget. unsigned getSGPRAllocGranule() const { - if (getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) - return 16; - return 8; + return AMDGPU::IsaInfo::getSGPRAllocGranule(getFeatureBits()); } /// \returns SGPR encoding granularity supported by the subtarget. unsigned getSGPREncodingGranule() const { - return 8; + return AMDGPU::IsaInfo::getSGPREncodingGranule(getFeatureBits()); } /// \returns Total number of SGPRs supported by the subtarget. unsigned getTotalNumSGPRs() const { - if (getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) - return 800; - return 512; + return AMDGPU::IsaInfo::getTotalNumSGPRs(getFeatureBits()); } /// \returns Addressable number of SGPRs supported by the subtarget. 
unsigned getAddressableNumSGPRs() const { - if (hasSGPRInitBug()) - return SISubtarget::FIXED_SGPR_COUNT_FOR_INIT_BUG; - if (getGeneration() >= VOLCANIC_ISLANDS) - return 102; - return 104; + return AMDGPU::IsaInfo::getAddressableNumSGPRs(getFeatureBits()); } /// \returns Minimum number of SGPRs that meets the given number of waves per /// execution unit requirement supported by the subtarget. - unsigned getMinNumSGPRs(unsigned WavesPerEU) const; + unsigned getMinNumSGPRs(unsigned WavesPerEU) const { + return AMDGPU::IsaInfo::getMinNumSGPRs(getFeatureBits(), WavesPerEU); + } /// \returns Maximum number of SGPRs that meets the given number of waves per /// execution unit requirement supported by the subtarget. - unsigned getMaxNumSGPRs(unsigned WavesPerEU, bool Addressable) const; + unsigned getMaxNumSGPRs(unsigned WavesPerEU, bool Addressable) const { + return AMDGPU::IsaInfo::getMaxNumSGPRs(getFeatureBits(), WavesPerEU, + Addressable); + } /// \returns Reserved number of SGPRs for given function \p MF. unsigned getReservedNumSGPRs(const MachineFunction &MF) const; @@ -693,31 +682,35 @@ /// \returns VGPR allocation granularity supported by the subtarget. unsigned getVGPRAllocGranule() const { - return 4; + return AMDGPU::IsaInfo::getVGPRAllocGranule(getFeatureBits());; } /// \returns VGPR encoding granularity supported by the subtarget. unsigned getVGPREncodingGranule() const { - return getVGPRAllocGranule(); + return AMDGPU::IsaInfo::getVGPREncodingGranule(getFeatureBits()); } /// \returns Total number of VGPRs supported by the subtarget. unsigned getTotalNumVGPRs() const { - return 256; + return AMDGPU::IsaInfo::getTotalNumVGPRs(getFeatureBits()); } /// \returns Addressable number of VGPRs supported by the subtarget. 
unsigned getAddressableNumVGPRs() const { - return getTotalNumVGPRs(); + return AMDGPU::IsaInfo::getAddressableNumVGPRs(getFeatureBits()); } /// \returns Minimum number of VGPRs that meets given number of waves per /// execution unit requirement supported by the subtarget. - unsigned getMinNumVGPRs(unsigned WavesPerEU) const; + unsigned getMinNumVGPRs(unsigned WavesPerEU) const { + return AMDGPU::IsaInfo::getMinNumVGPRs(getFeatureBits(), WavesPerEU); + } /// \returns Maximum number of VGPRs that meets given number of waves per /// execution unit requirement supported by the subtarget. - unsigned getMaxNumVGPRs(unsigned WavesPerEU) const; + unsigned getMaxNumVGPRs(unsigned WavesPerEU) const { + return AMDGPU::IsaInfo::getMaxNumVGPRs(getFeatureBits(), WavesPerEU); + } /// \returns Reserved number of VGPRs for given function \p MF. unsigned getReservedNumVGPRs(const MachineFunction &MF) const; Index: lib/Target/AMDGPU/AMDGPUSubtarget.cpp =================================================================== --- lib/Target/AMDGPU/AMDGPUSubtarget.cpp +++ lib/Target/AMDGPU/AMDGPUSubtarget.cpp @@ -325,53 +325,6 @@ return 1; } -unsigned SISubtarget::getMinNumSGPRs(unsigned WavesPerEU) const { - if (getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) { - switch (WavesPerEU) { - case 0: return 0; - case 10: return 0; - case 9: return 0; - case 8: return 81; - default: return 97; - } - } else { - switch (WavesPerEU) { - case 0: return 0; - case 10: return 0; - case 9: return 49; - case 8: return 57; - case 7: return 65; - case 6: return 73; - case 5: return 81; - default: return 97; - } - } -} - -unsigned SISubtarget::getMaxNumSGPRs(unsigned WavesPerEU, - bool Addressable) const { - if (getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) { - switch (WavesPerEU) { - case 0: return 80; - case 10: return 80; - case 9: return 80; - case 8: return 96; - default: return Addressable ? 
getAddressableNumSGPRs() : 112; - } - } else { - switch (WavesPerEU) { - case 0: return 48; - case 10: return 48; - case 9: return 56; - case 8: return 64; - case 7: return 72; - case 6: return 80; - case 5: return 96; - default: return getAddressableNumSGPRs(); - } - } -} - unsigned SISubtarget::getReservedNumSGPRs(const MachineFunction &MF) const { const SIMachineFunctionInfo &MFI = *MF.getInfo(); if (MFI.hasFlatScratchInit()) { @@ -430,44 +383,12 @@ } if (hasSGPRInitBug()) - MaxNumSGPRs = SISubtarget::FIXED_SGPR_COUNT_FOR_INIT_BUG; + MaxNumSGPRs = AMDGPU::IsaInfo::FIXED_NUM_SGPRS_FOR_INIT_BUG; return std::min(MaxNumSGPRs - getReservedNumSGPRs(MF), MaxAddressableNumSGPRs); } -unsigned SISubtarget::getMinNumVGPRs(unsigned WavesPerEU) const { - switch (WavesPerEU) { - case 0: return 0; - case 10: return 0; - case 9: return 25; - case 8: return 29; - case 7: return 33; - case 6: return 37; - case 5: return 41; - case 4: return 49; - case 3: return 65; - case 2: return 85; - default: return 129; - } -} - -unsigned SISubtarget::getMaxNumVGPRs(unsigned WavesPerEU) const { - switch (WavesPerEU) { - case 0: return 24; - case 10: return 24; - case 9: return 28; - case 8: return 32; - case 7: return 36; - case 6: return 40; - case 5: return 48; - case 4: return 64; - case 3: return 84; - case 2: return 128; - default: return getTotalNumVGPRs(); - } -} - unsigned SISubtarget::getReservedNumVGPRs(const MachineFunction &MF) const { if (debuggerReserveRegs()) return 4; Index: lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp =================================================================== --- lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp +++ lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp @@ -801,14 +801,16 @@ // Currently there is none suitable machinery in the core llvm-mc for this. // MCSymbol::isRedefinable is intended for another purpose, and // AsmParser::parseDirectiveSet() cannot be specialized for specific target. 
- AMDGPU::IsaVersion Isa = AMDGPU::getIsaVersion(getSTI().getFeatureBits()); + AMDGPU::IsaInfo::IsaVersion ISA = + AMDGPU::IsaInfo::getIsaVersion(getSTI().getFeatureBits()); MCContext &Ctx = getContext(); - MCSymbol *Sym = Ctx.getOrCreateSymbol(Twine(".option.machine_version_major")); - Sym->setVariableValue(MCConstantExpr::create(Isa.Major, Ctx)); + MCSymbol *Sym = + Ctx.getOrCreateSymbol(Twine(".option.machine_version_major")); + Sym->setVariableValue(MCConstantExpr::create(ISA.Major, Ctx)); Sym = Ctx.getOrCreateSymbol(Twine(".option.machine_version_minor")); - Sym->setVariableValue(MCConstantExpr::create(Isa.Minor, Ctx)); + Sym->setVariableValue(MCConstantExpr::create(ISA.Minor, Ctx)); Sym = Ctx.getOrCreateSymbol(Twine(".option.machine_version_stepping")); - Sym->setVariableValue(MCConstantExpr::create(Isa.Stepping, Ctx)); + Sym->setVariableValue(MCConstantExpr::create(ISA.Stepping, Ctx)); } KernelScope.initialize(getContext()); } @@ -1867,9 +1869,10 @@ // If this directive has no arguments, then use the ISA version for the // targeted GPU. 
if (getLexer().is(AsmToken::EndOfStatement)) { - AMDGPU::IsaVersion Isa = AMDGPU::getIsaVersion(getSTI().getFeatureBits()); - getTargetStreamer().EmitDirectiveHSACodeObjectISA(Isa.Major, Isa.Minor, - Isa.Stepping, + AMDGPU::IsaInfo::IsaVersion ISA = + AMDGPU::IsaInfo::getIsaVersion(getSTI().getFeatureBits()); + getTargetStreamer().EmitDirectiveHSACodeObjectISA(ISA.Major, ISA.Minor, + ISA.Stepping, "AMD", "AMDGPU"); return false; } @@ -2455,13 +2458,14 @@ if (getLexer().is(AsmToken::Amp) || getLexer().is(AsmToken::Comma)) Parser.Lex(); - IsaVersion IV = getIsaVersion(getSTI().getFeatureBits()); + AMDGPU::IsaInfo::IsaVersion ISA = + AMDGPU::IsaInfo::getIsaVersion(getSTI().getFeatureBits()); if (CntName == "vmcnt") - IntVal = encodeVmcnt(IV, IntVal, CntVal); + IntVal = encodeVmcnt(ISA, IntVal, CntVal); else if (CntName == "expcnt") - IntVal = encodeExpcnt(IV, IntVal, CntVal); + IntVal = encodeExpcnt(ISA, IntVal, CntVal); else if (CntName == "lgkmcnt") - IntVal = encodeLgkmcnt(IV, IntVal, CntVal); + IntVal = encodeLgkmcnt(ISA, IntVal, CntVal); else return true; @@ -2470,8 +2474,9 @@ OperandMatchResultTy AMDGPUAsmParser::parseSWaitCntOps(OperandVector &Operands) { - IsaVersion IV = getIsaVersion(getSTI().getFeatureBits()); - int64_t Waitcnt = getWaitcntBitMask(IV); + AMDGPU::IsaInfo::IsaVersion ISA = + AMDGPU::IsaInfo::getIsaVersion(getSTI().getFeatureBits()); + int64_t Waitcnt = getWaitcntBitMask(ISA); SMLoc S = Parser.getTok().getLoc(); switch(getLexer().getKind()) { Index: lib/Target/AMDGPU/InstPrinter/AMDGPUInstPrinter.cpp =================================================================== --- lib/Target/AMDGPU/InstPrinter/AMDGPUInstPrinter.cpp +++ lib/Target/AMDGPU/InstPrinter/AMDGPUInstPrinter.cpp @@ -1057,27 +1057,28 @@ void AMDGPUInstPrinter::printWaitFlag(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI, raw_ostream &O) { - IsaVersion IV = getIsaVersion(STI.getFeatureBits()); + AMDGPU::IsaInfo::IsaVersion ISA = + 
AMDGPU::IsaInfo::getIsaVersion(STI.getFeatureBits()); unsigned SImm16 = MI->getOperand(OpNo).getImm(); unsigned Vmcnt, Expcnt, Lgkmcnt; - decodeWaitcnt(IV, SImm16, Vmcnt, Expcnt, Lgkmcnt); + decodeWaitcnt(ISA, SImm16, Vmcnt, Expcnt, Lgkmcnt); bool NeedSpace = false; - if (Vmcnt != getVmcntBitMask(IV)) { + if (Vmcnt != getVmcntBitMask(ISA)) { O << "vmcnt(" << Vmcnt << ')'; NeedSpace = true; } - if (Expcnt != getExpcntBitMask(IV)) { + if (Expcnt != getExpcntBitMask(ISA)) { if (NeedSpace) O << ' '; O << "expcnt(" << Expcnt << ')'; NeedSpace = true; } - if (Lgkmcnt != getLgkmcntBitMask(IV)) { + if (Lgkmcnt != getLgkmcntBitMask(ISA)) { if (NeedSpace) O << ' '; O << "lgkmcnt(" << Lgkmcnt << ')'; Index: lib/Target/AMDGPU/MCTargetDesc/AMDGPURuntimeMD.h =================================================================== --- lib/Target/AMDGPU/MCTargetDesc/AMDGPURuntimeMD.h +++ lib/Target/AMDGPU/MCTargetDesc/AMDGPURuntimeMD.h @@ -17,10 +17,12 @@ #include namespace llvm { +class FeatureBitset; class Module; // Get runtime metadata as YAML string. 
-std::string getRuntimeMDYAMLString(Module &M); +std::string getRuntimeMDYAMLString(const FeatureBitset &Features, + const Module &M); } #endif Index: lib/Target/AMDGPU/MCTargetDesc/AMDGPURuntimeMD.cpp =================================================================== --- lib/Target/AMDGPU/MCTargetDesc/AMDGPURuntimeMD.cpp +++ lib/Target/AMDGPU/MCTargetDesc/AMDGPURuntimeMD.cpp @@ -16,6 +16,7 @@ #include "AMDGPU.h" #include "AMDGPURuntimeMetadata.h" #include "MCTargetDesc/AMDGPURuntimeMD.h" +#include "Utils/AMDGPUBaseInfo.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/StringRef.h" #include "llvm/ADT/StringSwitch.h" @@ -92,9 +93,30 @@ static const bool flow = true; }; +template <> struct MappingTraits { + static void mapping(IO &YamlIO, IsaInfo::Metadata &I) { + YamlIO.mapRequired(KeyName::IsaInfoWavefrontSize, I.WavefrontSize); + YamlIO.mapRequired(KeyName::IsaInfoLocalMemorySize, I.LocalMemorySize); + YamlIO.mapRequired(KeyName::IsaInfoEUsPerCU, I.EUsPerCU); + YamlIO.mapRequired(KeyName::IsaInfoMaxWavesPerEU, I.MaxWavesPerEU); + YamlIO.mapRequired(KeyName::IsaInfoMaxFlatWorkGroupSize, + I.MaxFlatWorkGroupSize); + YamlIO.mapRequired(KeyName::IsaInfoSGPRAllocGranule, I.SGPRAllocGranule); + YamlIO.mapRequired(KeyName::IsaInfoTotalNumSGPRs, I.TotalNumSGPRs); + YamlIO.mapRequired(KeyName::IsaInfoAddressableNumSGPRs, + I.AddressableNumSGPRs); + YamlIO.mapRequired(KeyName::IsaInfoVGPRAllocGranule, I.VGPRAllocGranule); + YamlIO.mapRequired(KeyName::IsaInfoTotalNumVGPRs, I.TotalNumVGPRs); + YamlIO.mapRequired(KeyName::IsaInfoAddressableNumVGPRs, + I.AddressableNumVGPRs); + } + static const bool flow = true; +}; + template <> struct MappingTraits { static void mapping(IO &YamlIO, Program::Metadata &Prog) { YamlIO.mapRequired(KeyName::MDVersion, Prog.MDVersionSeq); + YamlIO.mapRequired(KeyName::IsaInfo, Prog.IsaInfo); YamlIO.mapOptional(KeyName::PrintfInfo, Prog.PrintfInfo); YamlIO.mapOptional(KeyName::Kernels, Prog.Kernels); } @@ -383,10 +405,27 @@ } } -std::string 
llvm::getRuntimeMDYAMLString(Module &M) { +std::string llvm::getRuntimeMDYAMLString(const FeatureBitset &Features, + const Module &M) { Program::Metadata Prog; Prog.MDVersionSeq.push_back(MDVersion); Prog.MDVersionSeq.push_back(MDRevision); + Prog.IsaInfo.WavefrontSize = AMDGPU::IsaInfo::getWavefrontSize(Features); + Prog.IsaInfo.LocalMemorySize = AMDGPU::IsaInfo::getLocalMemorySize(Features); + Prog.IsaInfo.EUsPerCU = AMDGPU::IsaInfo::getEUsPerCU(Features); + Prog.IsaInfo.MaxWavesPerEU = AMDGPU::IsaInfo::getMaxWavesPerEU(Features); + Prog.IsaInfo.MaxFlatWorkGroupSize = + AMDGPU::IsaInfo::getMaxFlatWorkGroupSize(Features); + Prog.IsaInfo.SGPRAllocGranule = + AMDGPU::IsaInfo::getSGPRAllocGranule(Features); + Prog.IsaInfo.TotalNumSGPRs = AMDGPU::IsaInfo::getTotalNumSGPRs(Features); + Prog.IsaInfo.AddressableNumSGPRs = + AMDGPU::IsaInfo::getAddressableNumSGPRs(Features); + Prog.IsaInfo.VGPRAllocGranule = + AMDGPU::IsaInfo::getVGPRAllocGranule(Features); + Prog.IsaInfo.TotalNumVGPRs = AMDGPU::IsaInfo::getTotalNumVGPRs(Features); + Prog.IsaInfo.AddressableNumVGPRs = + AMDGPU::IsaInfo::getAddressableNumVGPRs(Features); // Set PrintfInfo. 
if (auto MD = M.getNamedMetadata("llvm.printf.fmts")) { Index: lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.h =================================================================== --- lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.h +++ lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.h @@ -17,6 +17,7 @@ #include "AMDGPUPTNote.h" class DataLayout; +class FeatureBitset; class Function; class MCELFStreamer; class MCSymbol; @@ -46,7 +47,8 @@ virtual void EmitAMDGPUHsaProgramScopeGlobal(StringRef GlobalName) = 0; - virtual void EmitRuntimeMetadata(Module &M) = 0; + virtual void EmitRuntimeMetadata(const FeatureBitset &Features, + const Module &M) = 0; virtual void EmitRuntimeMetadata(StringRef Metadata) = 0; }; @@ -70,7 +72,8 @@ void EmitAMDGPUHsaProgramScopeGlobal(StringRef GlobalName) override; - void EmitRuntimeMetadata(Module &M) override; + void EmitRuntimeMetadata(const FeatureBitset &Features, + const Module &M) override; void EmitRuntimeMetadata(StringRef Metadata) override; }; @@ -101,7 +104,8 @@ void EmitAMDGPUHsaProgramScopeGlobal(StringRef GlobalName) override; - void EmitRuntimeMetadata(Module &M) override; + void EmitRuntimeMetadata(const FeatureBitset &Features, + const Module &M) override; void EmitRuntimeMetadata(StringRef Metadata) override; }; Index: lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp =================================================================== --- lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp +++ lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp @@ -93,9 +93,10 @@ OS << "\t.amdgpu_hsa_program_global " << GlobalName << '\n'; } -void AMDGPUTargetAsmStreamer::EmitRuntimeMetadata(Module &M) { +void AMDGPUTargetAsmStreamer::EmitRuntimeMetadata(const FeatureBitset &Features, + const Module &M) { OS << "\t.amdgpu_runtime_metadata\n"; - OS << getRuntimeMDYAMLString(M); + OS << getRuntimeMDYAMLString(Features, M); OS << "\n\t.end_amdgpu_runtime_metadata\n"; } @@ -236,6 +237,7 @@ ); } -void 
AMDGPUTargetELFStreamer::EmitRuntimeMetadata(Module &M) { - EmitRuntimeMetadata(getRuntimeMDYAMLString(M)); +void AMDGPUTargetELFStreamer::EmitRuntimeMetadata(const FeatureBitset &Features, + const Module &M) { + EmitRuntimeMetadata(getRuntimeMDYAMLString(Features, M)); } Index: lib/Target/AMDGPU/SIInsertWaits.cpp =================================================================== --- lib/Target/AMDGPU/SIInsertWaits.cpp +++ lib/Target/AMDGPU/SIInsertWaits.cpp @@ -47,7 +47,6 @@ #define DEBUG_TYPE "si-insert-waits" using namespace llvm; -using namespace llvm::AMDGPU; namespace { @@ -76,7 +75,7 @@ const SIInstrInfo *TII = nullptr; const SIRegisterInfo *TRI = nullptr; const MachineRegisterInfo *MRI; - IsaVersion IV; + AMDGPU::IsaInfo::IsaVersion ISA; /// \brief Constant zero value static const Counters ZeroCounts; @@ -427,10 +426,10 @@ // Build the wait instruction BuildMI(MBB, I, DebugLoc(), TII->get(AMDGPU::S_WAITCNT)) - .addImm(encodeWaitcnt(IV, - Counts.Named.VM, - Counts.Named.EXP, - Counts.Named.LGKM)); + .addImm(AMDGPU::encodeWaitcnt(ISA, + Counts.Named.VM, + Counts.Named.EXP, + Counts.Named.LGKM)); LastOpcodeType = OTHER; LastInstWritesM0 = false; @@ -458,9 +457,9 @@ unsigned Imm = I->getOperand(0).getImm(); Counters Counts, WaitOn; - Counts.Named.VM = decodeVmcnt(IV, Imm); - Counts.Named.EXP = decodeExpcnt(IV, Imm); - Counts.Named.LGKM = decodeLgkmcnt(IV, Imm); + Counts.Named.VM = AMDGPU::decodeVmcnt(ISA, Imm); + Counts.Named.EXP = AMDGPU::decodeExpcnt(ISA, Imm); + Counts.Named.LGKM = AMDGPU::decodeLgkmcnt(ISA, Imm); for (unsigned i = 0; i < 3; ++i) { if (Counts.Array[i] <= LastIssued.Array[i]) @@ -534,12 +533,12 @@ TII = ST->getInstrInfo(); TRI = &TII->getRegisterInfo(); MRI = &MF.getRegInfo(); - IV = getIsaVersion(ST->getFeatureBits()); + ISA = AMDGPU::IsaInfo::getIsaVersion(ST->getFeatureBits()); const SIMachineFunctionInfo *MFI = MF.getInfo(); - HardwareLimits.Named.VM = getVmcntBitMask(IV); - HardwareLimits.Named.EXP = getExpcntBitMask(IV); - 
HardwareLimits.Named.LGKM = getLgkmcntBitMask(IV); + HardwareLimits.Named.VM = AMDGPU::getVmcntBitMask(ISA); + HardwareLimits.Named.EXP = AMDGPU::getExpcntBitMask(ISA); + HardwareLimits.Named.LGKM = AMDGPU::getLgkmcntBitMask(ISA); WaitedOn = ZeroCounts; DelayedWaitOn = ZeroCounts; Index: lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h =================================================================== --- lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h +++ lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h @@ -34,16 +34,118 @@ namespace AMDGPU { -LLVM_READONLY -int16_t getNamedOperandIdx(uint16_t Opcode, uint16_t NamedIdx); +namespace IsaInfo { +enum { + // The closed Vulkan driver sets 96, which limits the wave count to 8 but + // doesn't spill SGPRs as much as when 80 is set. + FIXED_NUM_SGPRS_FOR_INIT_BUG = 96 +}; + +/// \brief Instruction set architecture version. struct IsaVersion { unsigned Major; unsigned Minor; unsigned Stepping; }; +/// \returns Isa version for given subtarget \p Features. IsaVersion getIsaVersion(const FeatureBitset &Features); + +/// \returns Wavefront size for given subtarget \p Features. +unsigned getWavefrontSize(const FeatureBitset &Features); + +/// \returns Local memory size in bytes for given subtarget \p Features. +unsigned getLocalMemorySize(const FeatureBitset &Features); + +/// \returns Number of execution units per compute unit for given subtarget \p +/// Features. +unsigned getEUsPerCU(const FeatureBitset &Features); + +/// \returns Maximum number of work groups per compute unit for given subtarget +/// \p Features and limited by given \p FlatWorkGroupSize. +unsigned getMaxWorkGroupsPerCU(const FeatureBitset &Features, + unsigned FlatWorkGroupSize); + +/// \returns Maximum number of waves per compute unit for given subtarget \p +/// Features without any kind of limitation. 
+unsigned getMaxWavesPerCU(const FeatureBitset &Features); + +/// \returns Maximum number of waves per compute unit for given subtarget \p +/// Features and limited by given \p FlatWorkGroupSize. +unsigned getMaxWavesPerCU(const FeatureBitset &Features, + unsigned FlatWorkGroupSize); + +/// \returns Minimum number of waves per execution unit for given subtarget \p +/// Features. +unsigned getMinWavesPerEU(const FeatureBitset &Features); + +/// \returns Maximum number of waves per execution unit for given subtarget \p +/// Features without any kind of limitation. +unsigned getMaxWavesPerEU(const FeatureBitset &Features); + +/// \returns Maximum number of waves per execution unit for given subtarget \p +/// Features and limited by given \p FlatWorkGroupSize. +unsigned getMaxWavesPerEU(const FeatureBitset &Features, + unsigned FlatWorkGroupSize); + +/// \returns Minimum flat work group size for given subtarget \p Features. +unsigned getMinFlatWorkGroupSize(const FeatureBitset &Features); + +/// \returns Maximum flat work group size for given subtarget \p Features. +unsigned getMaxFlatWorkGroupSize(const FeatureBitset &Features); + +/// \returns Number of waves per work group for given subtarget \p Features and +/// limited by given \p FlatWorkGroupSize. +unsigned getWavesPerWorkGroup(const FeatureBitset &Features, + unsigned FlatWorkGroupSize); + +/// \returns SGPR allocation granularity for given subtarget \p Features. +unsigned getSGPRAllocGranule(const FeatureBitset &Features); + +/// \returns SGPR encoding granularity for given subtarget \p Features. +unsigned getSGPREncodingGranule(const FeatureBitset &Features); + +/// \returns Total number of SGPRs for given subtarget \p Features. +unsigned getTotalNumSGPRs(const FeatureBitset &Features); + +/// \returns Addressable number of SGPRs for given subtarget \p Features. 
+unsigned getAddressableNumSGPRs(const FeatureBitset &Features); + +/// \returns Minimum number of SGPRs that meets the given number of waves per +/// execution unit requirement for given subtarget \p Features. +unsigned getMinNumSGPRs(const FeatureBitset &Features, unsigned WavesPerEU); + +/// \returns Maximum number of SGPRs that meets the given number of waves per +/// execution unit requirement for given subtarget \p Features. +unsigned getMaxNumSGPRs(const FeatureBitset &Features, unsigned WavesPerEU, + bool Addressable); + +/// \returns VGPR allocation granularity for given subtarget \p Features. +unsigned getVGPRAllocGranule(const FeatureBitset &Features); + +/// \returns VGPR encoding granularity for given subtarget \p Features. +unsigned getVGPREncodingGranule(const FeatureBitset &Features); + +/// \returns Total number of VGPRs for given subtarget \p Features. +unsigned getTotalNumVGPRs(const FeatureBitset &Features); + +/// \returns Addressable number of VGPRs for given subtarget \p Features. +unsigned getAddressableNumVGPRs(const FeatureBitset &Features); + +/// \returns Minimum number of VGPRs that meets given number of waves per +/// execution unit requirement for given subtarget \p Features. +unsigned getMinNumVGPRs(const FeatureBitset &Features, unsigned WavesPerEU); + +/// \returns Maximum number of VGPRs that meets given number of waves per +/// execution unit requirement for given subtarget \p Features. +unsigned getMaxNumVGPRs(const FeatureBitset &Features, unsigned WavesPerEU); + +} // namespace IsaInfo + +LLVM_READONLY +int16_t getNamedOperandIdx(uint16_t Opcode, uint16_t NamedIdx); + void initDefaultAMDKernelCodeT(amd_kernel_code_t &Header, const FeatureBitset &Features); MCSection *getHSATextSection(MCContext &Ctx); @@ -84,26 +186,26 @@ std::pair<int, int> Default, bool OnlyFirstRequired = false); -/// \returns Waitcnt bit mask for given isa \p Version.
-unsigned getWaitcntBitMask(IsaVersion Version); - /// \returns Vmcnt bit mask for given isa \p Version. -unsigned getVmcntBitMask(IsaVersion Version); +unsigned getVmcntBitMask(const IsaInfo::IsaVersion &Version); /// \returns Expcnt bit mask for given isa \p Version. -unsigned getExpcntBitMask(IsaVersion Version); +unsigned getExpcntBitMask(const IsaInfo::IsaVersion &Version); /// \returns Lgkmcnt bit mask for given isa \p Version. -unsigned getLgkmcntBitMask(IsaVersion Version); +unsigned getLgkmcntBitMask(const IsaInfo::IsaVersion &Version); + +/// \returns Waitcnt bit mask for given isa \p Version. +unsigned getWaitcntBitMask(const IsaInfo::IsaVersion &Version); /// \returns Decoded Vmcnt from given \p Waitcnt for given isa \p Version. -unsigned decodeVmcnt(IsaVersion Version, unsigned Waitcnt); +unsigned decodeVmcnt(const IsaInfo::IsaVersion &Version, unsigned Waitcnt); /// \returns Decoded Expcnt from given \p Waitcnt for given isa \p Version. -unsigned decodeExpcnt(IsaVersion Version, unsigned Waitcnt); +unsigned decodeExpcnt(const IsaInfo::IsaVersion &Version, unsigned Waitcnt); /// \returns Decoded Lgkmcnt from given \p Waitcnt for given isa \p Version. -unsigned decodeLgkmcnt(IsaVersion Version, unsigned Waitcnt); +unsigned decodeLgkmcnt(const IsaInfo::IsaVersion &Version, unsigned Waitcnt); /// \brief Decodes Vmcnt, Expcnt and Lgkmcnt from given \p Waitcnt for given isa /// \p Version, and writes decoded values into \p Vmcnt, \p Expcnt and @@ -113,17 +215,20 @@ /// \p Vmcnt = \p Waitcnt[3:0] /// \p Expcnt = \p Waitcnt[6:4] /// \p Lgkmcnt = \p Waitcnt[11:8] -void decodeWaitcnt(IsaVersion Version, unsigned Waitcnt, +void decodeWaitcnt(const IsaInfo::IsaVersion &Version, unsigned Waitcnt, unsigned &Vmcnt, unsigned &Expcnt, unsigned &Lgkmcnt); /// \returns \p Waitcnt with encoded \p Vmcnt for given isa \p Version. 
-unsigned encodeVmcnt(IsaVersion Version, unsigned Waitcnt, unsigned Vmcnt); +unsigned encodeVmcnt(const IsaInfo::IsaVersion &Version, unsigned Waitcnt, + unsigned Vmcnt); /// \returns \p Waitcnt with encoded \p Expcnt for given isa \p Version. -unsigned encodeExpcnt(IsaVersion Version, unsigned Waitcnt, unsigned Expcnt); +unsigned encodeExpcnt(const IsaInfo::IsaVersion &Version, unsigned Waitcnt, + unsigned Expcnt); /// \returns \p Waitcnt with encoded \p Lgkmcnt for given isa \p Version. -unsigned encodeLgkmcnt(IsaVersion Version, unsigned Waitcnt, unsigned Lgkmcnt); +unsigned encodeLgkmcnt(const IsaInfo::IsaVersion &Version, unsigned Waitcnt, + unsigned Lgkmcnt); /// \brief Encodes \p Vmcnt, \p Expcnt and \p Lgkmcnt into Waitcnt for given isa /// \p Version. @@ -135,7 +240,7 @@ /// /// \returns Waitcnt with encoded \p Vmcnt, \p Expcnt and \p Lgkmcnt for given /// isa \p Version. -unsigned encodeWaitcnt(IsaVersion Version, +unsigned encodeWaitcnt(const IsaInfo::IsaVersion &Version, unsigned Vmcnt, unsigned Expcnt, unsigned Lgkmcnt); unsigned getInitialPSInputAddr(const Function &F); Index: lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp =================================================================== --- lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp +++ lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp @@ -76,47 +76,240 @@ /// \returns Lgkmcnt bit width. unsigned getLgkmcntBitWidth() { return 4; } -} // anonymous namespace +} // namespace anonymous namespace llvm { namespace AMDGPU { -IsaVersion getIsaVersion(const FeatureBitset &Features) { +namespace IsaInfo { +IsaVersion getIsaVersion(const FeatureBitset &Features) { + // CI. if (Features.test(FeatureISAVersion7_0_0)) return {7, 0, 0}; - if (Features.test(FeatureISAVersion7_0_1)) return {7, 0, 1}; - if (Features.test(FeatureISAVersion7_0_2)) return {7, 0, 2}; + // VI. 
if (Features.test(FeatureISAVersion8_0_0)) return {8, 0, 0}; - if (Features.test(FeatureISAVersion8_0_1)) return {8, 0, 1}; - if (Features.test(FeatureISAVersion8_0_2)) return {8, 0, 2}; - if (Features.test(FeatureISAVersion8_0_3)) return {8, 0, 3}; - if (Features.test(FeatureISAVersion8_0_4)) return {8, 0, 4}; - if (Features.test(FeatureISAVersion8_1_0)) return {8, 1, 0}; - return {0, 0, 0}; + if (!Features.test(FeatureGCN) || Features.test(FeatureSouthernIslands)) + return {0, 0, 0}; + return {7, 0, 0}; +} + +unsigned getWavefrontSize(const FeatureBitset &Features) { + if (Features.test(FeatureWavefrontSize16)) + return 16; + if (Features.test(FeatureWavefrontSize32)) + return 32; + + return 64; +} + +unsigned getLocalMemorySize(const FeatureBitset &Features) { + if (Features.test(FeatureLocalMemorySize32768)) + return 32768; + if (Features.test(FeatureLocalMemorySize65536)) + return 65536; + + return 0; +} + +unsigned getEUsPerCU(const FeatureBitset &Features) { + return 4; +} + +unsigned getMaxWorkGroupsPerCU(const FeatureBitset &Features, + unsigned FlatWorkGroupSize) { + if (!Features.test(FeatureGCN)) + return 8; + return getWavesPerWorkGroup(Features, FlatWorkGroupSize) == 1 ? 40 : 16; +} + +unsigned getMaxWavesPerCU(const FeatureBitset &Features) { + return getMaxWavesPerEU(Features) * getEUsPerCU(Features); +} + +unsigned getMaxWavesPerCU(const FeatureBitset &Features, + unsigned FlatWorkGroupSize) { + return getWavesPerWorkGroup(Features, FlatWorkGroupSize); +} + +unsigned getMinWavesPerEU(const FeatureBitset &Features) { + return 1; +} + +unsigned getMaxWavesPerEU(const FeatureBitset &Features) { + if (!Features.test(FeatureGCN)) + return 8; + // FIXME: Need to take scratch memory into account. 
+ return 10; +} + +unsigned getMaxWavesPerEU(const FeatureBitset &Features, + unsigned FlatWorkGroupSize) { + return alignTo(getMaxWavesPerCU(Features, FlatWorkGroupSize), + getEUsPerCU(Features)) / getEUsPerCU(Features); +} + +unsigned getMinFlatWorkGroupSize(const FeatureBitset &Features) { + return 1; +} + +unsigned getMaxFlatWorkGroupSize(const FeatureBitset &Features) { + return 2048; +} + +unsigned getWavesPerWorkGroup(const FeatureBitset &Features, + unsigned FlatWorkGroupSize) { + return alignTo(FlatWorkGroupSize, getWavefrontSize(Features)) / + getWavefrontSize(Features); +} + +unsigned getSGPRAllocGranule(const FeatureBitset &Features) { + IsaVersion Version = getIsaVersion(Features); + if (Version.Major >= 8) + return 16; + return 8; +} + +unsigned getSGPREncodingGranule(const FeatureBitset &Features) { + return 8; +} + +unsigned getTotalNumSGPRs(const FeatureBitset &Features) { + IsaVersion Version = getIsaVersion(Features); + if (Version.Major >= 8) + return 800; + return 512; } +unsigned getAddressableNumSGPRs(const FeatureBitset &Features) { + if (Features.test(FeatureSGPRInitBug)) + return FIXED_NUM_SGPRS_FOR_INIT_BUG; + + IsaVersion Version = getIsaVersion(Features); + if (Version.Major >= 8) + return 102; + return 104; +} + +unsigned getMinNumSGPRs(const FeatureBitset &Features, unsigned WavesPerEU) { + IsaVersion Version = getIsaVersion(Features); + if (Version.Major >= 8) { + switch (WavesPerEU) { + case 0: return 0; + case 10: return 0; + case 9: return 0; + case 8: return 81; + default: return 97; + } + } else { + switch (WavesPerEU) { + case 0: return 0; + case 10: return 0; + case 9: return 49; + case 8: return 57; + case 7: return 65; + case 6: return 73; + case 5: return 81; + default: return 97; + } + } +} + +unsigned getMaxNumSGPRs(const FeatureBitset &Features, unsigned WavesPerEU, + bool Addressable) { + IsaVersion Version = getIsaVersion(Features); + if (Version.Major >= 8) { + switch (WavesPerEU) { + case 0: return 80; + case 10: 
return 80; + case 9: return 80; + case 8: return 96; + default: return Addressable ? getAddressableNumSGPRs(Features) : 112; + } + } else { + switch (WavesPerEU) { + case 0: return 48; + case 10: return 48; + case 9: return 56; + case 8: return 64; + case 7: return 72; + case 6: return 80; + case 5: return 96; + default: return getAddressableNumSGPRs(Features); + } + } +} + +unsigned getVGPRAllocGranule(const FeatureBitset &Features) { + return 4; +} + +unsigned getVGPREncodingGranule(const FeatureBitset &Features) { + return getVGPRAllocGranule(Features); +} + +unsigned getTotalNumVGPRs(const FeatureBitset &Features) { + return 256; +} + +unsigned getAddressableNumVGPRs(const FeatureBitset &Features) { + return getTotalNumVGPRs(Features); +} + +unsigned getMinNumVGPRs(const FeatureBitset &Features, unsigned WavesPerEU) { + switch (WavesPerEU) { + case 0: return 0; + case 10: return 0; + case 9: return 25; + case 8: return 29; + case 7: return 33; + case 6: return 37; + case 5: return 41; + case 4: return 49; + case 3: return 65; + case 2: return 85; + default: return 129; + } +} + +unsigned getMaxNumVGPRs(const FeatureBitset &Features, unsigned WavesPerEU) { + switch (WavesPerEU) { + case 0: return 24; + case 10: return 24; + case 9: return 28; + case 8: return 32; + case 7: return 36; + case 6: return 40; + case 5: return 48; + case 4: return 64; + case 3: return 84; + case 2: return 128; + default: return getTotalNumVGPRs(Features); + } +} + +} // namespace IsaInfo + void initDefaultAMDKernelCodeT(amd_kernel_code_t &Header, const FeatureBitset &Features) { - - IsaVersion ISA = getIsaVersion(Features); + IsaInfo::IsaVersion ISA = IsaInfo::getIsaVersion(Features); memset(&Header, 0, sizeof(Header)); @@ -224,57 +417,60 @@ return Ints; } -unsigned getWaitcntBitMask(IsaVersion Version) { - unsigned Vmcnt = getBitMask(getVmcntBitShift(), getVmcntBitWidth()); - unsigned Expcnt = getBitMask(getExpcntBitShift(), getExpcntBitWidth()); - unsigned Lgkmcnt = 
getBitMask(getLgkmcntBitShift(), getLgkmcntBitWidth()); - return Vmcnt | Expcnt | Lgkmcnt; -} - -unsigned getVmcntBitMask(IsaVersion Version) { +unsigned getVmcntBitMask(const IsaInfo::IsaVersion &Version) { return (1 << getVmcntBitWidth()) - 1; } -unsigned getExpcntBitMask(IsaVersion Version) { +unsigned getExpcntBitMask(const IsaInfo::IsaVersion &Version) { return (1 << getExpcntBitWidth()) - 1; } -unsigned getLgkmcntBitMask(IsaVersion Version) { +unsigned getLgkmcntBitMask(const IsaInfo::IsaVersion &Version) { return (1 << getLgkmcntBitWidth()) - 1; } -unsigned decodeVmcnt(IsaVersion Version, unsigned Waitcnt) { +unsigned getWaitcntBitMask(const IsaInfo::IsaVersion &Version) { + unsigned Vmcnt = getBitMask(getVmcntBitShift(), getVmcntBitWidth()); + unsigned Expcnt = getBitMask(getExpcntBitShift(), getExpcntBitWidth()); + unsigned Lgkmcnt = getBitMask(getLgkmcntBitShift(), getLgkmcntBitWidth()); + return Vmcnt | Expcnt | Lgkmcnt; +} + +unsigned decodeVmcnt(const IsaInfo::IsaVersion &Version, unsigned Waitcnt) { return unpackBits(Waitcnt, getVmcntBitShift(), getVmcntBitWidth()); } -unsigned decodeExpcnt(IsaVersion Version, unsigned Waitcnt) { +unsigned decodeExpcnt(const IsaInfo::IsaVersion &Version, unsigned Waitcnt) { return unpackBits(Waitcnt, getExpcntBitShift(), getExpcntBitWidth()); } -unsigned decodeLgkmcnt(IsaVersion Version, unsigned Waitcnt) { +unsigned decodeLgkmcnt(const IsaInfo::IsaVersion &Version, unsigned Waitcnt) { return unpackBits(Waitcnt, getLgkmcntBitShift(), getLgkmcntBitWidth()); } -void decodeWaitcnt(IsaVersion Version, unsigned Waitcnt, +void decodeWaitcnt(const IsaInfo::IsaVersion &Version, unsigned Waitcnt, unsigned &Vmcnt, unsigned &Expcnt, unsigned &Lgkmcnt) { Vmcnt = decodeVmcnt(Version, Waitcnt); Expcnt = decodeExpcnt(Version, Waitcnt); Lgkmcnt = decodeLgkmcnt(Version, Waitcnt); } -unsigned encodeVmcnt(IsaVersion Version, unsigned Waitcnt, unsigned Vmcnt) { +unsigned encodeVmcnt(const IsaInfo::IsaVersion &Version, unsigned Waitcnt, + 
unsigned Vmcnt) { return packBits(Vmcnt, Waitcnt, getVmcntBitShift(), getVmcntBitWidth()); } -unsigned encodeExpcnt(IsaVersion Version, unsigned Waitcnt, unsigned Expcnt) { +unsigned encodeExpcnt(const IsaInfo::IsaVersion &Version, unsigned Waitcnt, + unsigned Expcnt) { return packBits(Expcnt, Waitcnt, getExpcntBitShift(), getExpcntBitWidth()); } -unsigned encodeLgkmcnt(IsaVersion Version, unsigned Waitcnt, unsigned Lgkmcnt) { +unsigned encodeLgkmcnt(const IsaInfo::IsaVersion &Version, unsigned Waitcnt, + unsigned Lgkmcnt) { return packBits(Lgkmcnt, Waitcnt, getLgkmcntBitShift(), getLgkmcntBitWidth()); } -unsigned encodeWaitcnt(IsaVersion Version, +unsigned encodeWaitcnt(const IsaInfo::IsaVersion &Version, unsigned Vmcnt, unsigned Expcnt, unsigned Lgkmcnt) { unsigned Waitcnt = getWaitcntBitMask(Version); Waitcnt = encodeVmcnt(Version, Waitcnt, Vmcnt); Index: test/CodeGen/AMDGPU/invalid-opencl-version-metadata1.ll =================================================================== --- test/CodeGen/AMDGPU/invalid-opencl-version-metadata1.ll +++ test/CodeGen/AMDGPU/invalid-opencl-version-metadata1.ll @@ -1,6 +1,6 @@ ; RUN: llc -mtriple=amdgcn--amdhsa -filetype=obj -o - < %s | llvm-readobj -amdgpu-runtime-metadata | FileCheck %s ; check llc does not crash for invalid opencl version metadata -; CHECK: { amd.MDVersion: [ 2, 0 ] } +; CHECK: { amd.MDVersion: [ 2, 1 ], amd.IsaInfo: { amd.IsaInfoWavefrontSize: 64, amd.IsaInfoLocalMemorySize: 65536, amd.IsaInfoEUsPerCU: 4, amd.IsaInfoMaxWavesPerEU: 10, amd.IsaInfoMaxFlatWorkGroupSize: 2048, amd.IsaInfoSGPRAllocGranule: 8, amd.IsaInfoTotalNumSGPRs: 512, amd.IsaInfoAddressableNumSGPRs: 104, amd.IsaInfoVGPRAllocGranule: 4, amd.IsaInfoTotalNumVGPRs: 256, amd.IsaInfoAddressableNumVGPRs: 256 } } !opencl.ocl.version = !{} Index: test/CodeGen/AMDGPU/invalid-opencl-version-metadata2.ll =================================================================== --- test/CodeGen/AMDGPU/invalid-opencl-version-metadata2.ll +++ 
test/CodeGen/AMDGPU/invalid-opencl-version-metadata2.ll @@ -1,7 +1,7 @@ ; RUN: llc -mtriple=amdgcn--amdhsa -filetype=obj -o - < %s | llvm-readobj -amdgpu-runtime-metadata | FileCheck %s ; check llc does not crash for invalid opencl version metadata -; CHECK: { amd.MDVersion: [ 2, 0 ] } +; CHECK: { amd.MDVersion: [ 2, 1 ], amd.IsaInfo: { amd.IsaInfoWavefrontSize: 64, amd.IsaInfoLocalMemorySize: 65536, amd.IsaInfoEUsPerCU: 4, amd.IsaInfoMaxWavesPerEU: 10, amd.IsaInfoMaxFlatWorkGroupSize: 2048, amd.IsaInfoSGPRAllocGranule: 8, amd.IsaInfoTotalNumSGPRs: 512, amd.IsaInfoAddressableNumSGPRs: 104, amd.IsaInfoVGPRAllocGranule: 4, amd.IsaInfoTotalNumVGPRs: 256, amd.IsaInfoAddressableNumVGPRs: 256 } } !opencl.ocl.version = !{!0} !0 = !{} Index: test/CodeGen/AMDGPU/invalid-opencl-version-metadata3.ll =================================================================== --- test/CodeGen/AMDGPU/invalid-opencl-version-metadata3.ll +++ test/CodeGen/AMDGPU/invalid-opencl-version-metadata3.ll @@ -1,7 +1,7 @@ ; RUN: llc -mtriple=amdgcn--amdhsa -filetype=obj -o - < %s | llvm-readobj -amdgpu-runtime-metadata | FileCheck %s ; check llc does not crash for invalid opencl version metadata -; CHECK: { amd.MDVersion: [ 2, 0 ] } +; CHECK: { amd.MDVersion: [ 2, 1 ], amd.IsaInfo: { amd.IsaInfoWavefrontSize: 64, amd.IsaInfoLocalMemorySize: 65536, amd.IsaInfoEUsPerCU: 4, amd.IsaInfoMaxWavesPerEU: 10, amd.IsaInfoMaxFlatWorkGroupSize: 2048, amd.IsaInfoSGPRAllocGranule: 8, amd.IsaInfoTotalNumSGPRs: 512, amd.IsaInfoAddressableNumSGPRs: 104, amd.IsaInfoVGPRAllocGranule: 4, amd.IsaInfoTotalNumVGPRs: 256, amd.IsaInfoAddressableNumVGPRs: 256 } } !opencl.ocl.version = !{!0} !0 = !{i32 1} Index: test/CodeGen/AMDGPU/runtime-metadata.ll =================================================================== --- test/CodeGen/AMDGPU/runtime-metadata.ll +++ test/CodeGen/AMDGPU/runtime-metadata.ll @@ -1,4 +1,5 @@ -; RUN: llc -mtriple=amdgcn--amdhsa -filetype=obj -o - < %s | llvm-readobj -amdgpu-runtime-metadata 
-elf-output-style=GNU -notes | FileCheck %s --check-prefix=NOTES +; RUN: llc -mtriple=amdgcn--amdhsa -filetype=obj -o - < %s | llvm-readobj -amdgpu-runtime-metadata -elf-output-style=GNU -notes | FileCheck %s --check-prefix=NOTES --check-prefix=SI +; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=fiji -filetype=obj -o - < %s | llvm-readobj -amdgpu-runtime-metadata -elf-output-style=GNU -notes | FileCheck %s --check-prefix=NOTES --check-prefix=VI ; RUN: llc -mtriple=amdgcn--amdhsa -filetype=obj -amdgpu-dump-rtmd -amdgpu-check-rtmd-parser %s -o - 2>&1 | FileCheck --check-prefix=CHECK --check-prefix=PARSER %s %struct.A = type { i8, float } @@ -11,9 +12,10 @@ %opencl.clk_event_t = type opaque ; CHECK: --- -; CHECK-NEXT: { amd.MDVersion: [ 2, 0 ], amd.PrintfInfo: [ '1:1:4:%d\n', '2:1:8:%g\n' ], amd.Kernels: +; SI: { amd.MDVersion: [ 2, 1 ], amd.IsaInfo: { amd.IsaInfoWavefrontSize: 64, amd.IsaInfoLocalMemorySize: 65536, amd.IsaInfoEUsPerCU: 4, amd.IsaInfoMaxWavesPerEU: 10, amd.IsaInfoMaxFlatWorkGroupSize: 2048, amd.IsaInfoSGPRAllocGranule: 8, amd.IsaInfoTotalNumSGPRs: 512, amd.IsaInfoAddressableNumSGPRs: 104, amd.IsaInfoVGPRAllocGranule: 4, amd.IsaInfoTotalNumVGPRs: 256, amd.IsaInfoAddressableNumVGPRs: 256 }, amd.PrintfInfo: [ '1:1:4:%d\n', '2:1:8:%g\n' ], amd.Kernels: +; VI: { amd.MDVersion: [ 2, 1 ], amd.IsaInfo: { amd.IsaInfoWavefrontSize: 64, amd.IsaInfoLocalMemorySize: 65536, amd.IsaInfoEUsPerCU: 4, amd.IsaInfoMaxWavesPerEU: 10, amd.IsaInfoMaxFlatWorkGroupSize: 2048, amd.IsaInfoSGPRAllocGranule: 16, amd.IsaInfoTotalNumSGPRs: 800, amd.IsaInfoAddressableNumSGPRs: 102, amd.IsaInfoVGPRAllocGranule: 4, amd.IsaInfoTotalNumVGPRs: 256, amd.IsaInfoAddressableNumVGPRs: 256 }, amd.PrintfInfo: [ '1:1:4:%d\n', '2:1:8:%g\n' ], amd.Kernels: -; CHECK-NEXT: - { amd.KernelName: test_char, amd.Language: OpenCL C, amd.LanguageVersion: [ 2, 0 ], amd.Args: +; CHECK: - { amd.KernelName: test_char, amd.Language: OpenCL C, amd.LanguageVersion: [ 2, 0 ], amd.Args: ; CHECK-NEXT: - { amd.ArgSize: 1, 
amd.ArgAlign: 1, amd.ArgKind: 0, amd.ArgValueType: 1, amd.ArgTypeName: char, amd.ArgAccQual: 0 } ; CHECK-NEXT: - { amd.ArgSize: 8, amd.ArgAlign: 8, amd.ArgKind: 7, amd.ArgValueType: 9 } ; CHECK-NEXT: - { amd.ArgSize: 8, amd.ArgAlign: 8, amd.ArgKind: 8, amd.ArgValueType: 9 } @@ -345,7 +347,9 @@ ; NOTES-NEXT: Owner Data size Description ; NOTES-NEXT: AMD 0x00000008 Unknown note type: (0x00000001) ; NOTES-NEXT: AMD 0x0000001b Unknown note type: (0x00000003) -; NOTES-NEXT: AMD 0x00005196 Unknown note type: (0x00000008) + +; SI: AMD 0x0000530d Unknown note type: (0x00000008) +; VI: AMD 0x0000530e Unknown note type: (0x00000008) !llvm.printf.fmts = !{!100, !101}