diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp --- a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp @@ -441,7 +441,7 @@ assert(isUInt<32>(PI.ScratchSize)); assert(isUInt<32>(PI.getComputePGMRSrc1())); - assert(isUInt<32>(PI.ComputePGMRSrc2)); + assert(isUInt<32>(PI.getComputePGMRSrc2())); KernelDescriptor.group_segment_fixed_size = PI.LDSSize; KernelDescriptor.private_segment_fixed_size = PI.ScratchSize; @@ -450,7 +450,7 @@ KernelDescriptor.kernarg_size = STM.getKernArgSegmentSize(F, MaxKernArgAlign); KernelDescriptor.compute_pgm_rsrc1 = PI.getComputePGMRSrc1(); - KernelDescriptor.compute_pgm_rsrc2 = PI.ComputePGMRSrc2; + KernelDescriptor.compute_pgm_rsrc2 = PI.getComputePGMRSrc2(); KernelDescriptor.kernel_code_properties = getAmdhsaKernelCodeProperties(MF); assert(STM.hasGFX90AInsts() || CurrentProgramInfo.ComputePGMRSrc3GFX90A == 0); @@ -579,28 +579,27 @@ OutStreamer->emitRawComment( " WaveLimiterHint : " + Twine(MFI->needsWaveLimiter()), false); - OutStreamer->emitRawComment( - " COMPUTE_PGM_RSRC2:SCRATCH_EN: " + - Twine(G_00B84C_SCRATCH_EN(CurrentProgramInfo.ComputePGMRSrc2)), false); - OutStreamer->emitRawComment( - " COMPUTE_PGM_RSRC2:USER_SGPR: " + - Twine(G_00B84C_USER_SGPR(CurrentProgramInfo.ComputePGMRSrc2)), false); - OutStreamer->emitRawComment( - " COMPUTE_PGM_RSRC2:TRAP_HANDLER: " + - Twine(G_00B84C_TRAP_HANDLER(CurrentProgramInfo.ComputePGMRSrc2)), false); - OutStreamer->emitRawComment( - " COMPUTE_PGM_RSRC2:TGID_X_EN: " + - Twine(G_00B84C_TGID_X_EN(CurrentProgramInfo.ComputePGMRSrc2)), false); - OutStreamer->emitRawComment( - " COMPUTE_PGM_RSRC2:TGID_Y_EN: " + - Twine(G_00B84C_TGID_Y_EN(CurrentProgramInfo.ComputePGMRSrc2)), false); - OutStreamer->emitRawComment( - " COMPUTE_PGM_RSRC2:TGID_Z_EN: " + - Twine(G_00B84C_TGID_Z_EN(CurrentProgramInfo.ComputePGMRSrc2)), false); - OutStreamer->emitRawComment( - " COMPUTE_PGM_RSRC2:TIDIG_COMP_CNT: " + - 
Twine(G_00B84C_TIDIG_COMP_CNT(CurrentProgramInfo.ComputePGMRSrc2)), - false); + OutStreamer->emitRawComment(" COMPUTE_PGM_RSRC2:SCRATCH_EN: " + + Twine(CurrentProgramInfo.ScratchEnable), + false); + OutStreamer->emitRawComment(" COMPUTE_PGM_RSRC2:USER_SGPR: " + + Twine(CurrentProgramInfo.UserSGPR), + false); + OutStreamer->emitRawComment(" COMPUTE_PGM_RSRC2:TRAP_HANDLER: " + + Twine(CurrentProgramInfo.TrapHandlerEnable), + false); + OutStreamer->emitRawComment(" COMPUTE_PGM_RSRC2:TGID_X_EN: " + + Twine(CurrentProgramInfo.TGIdXEnable), + false); + OutStreamer->emitRawComment(" COMPUTE_PGM_RSRC2:TGID_Y_EN: " + + Twine(CurrentProgramInfo.TGIdYEnable), + false); + OutStreamer->emitRawComment(" COMPUTE_PGM_RSRC2:TGID_Z_EN: " + + Twine(CurrentProgramInfo.TGIdZEnable), + false); + OutStreamer->emitRawComment(" COMPUTE_PGM_RSRC2:TIDIG_COMP_CNT: " + + Twine(CurrentProgramInfo.TIdIGCompCount), + false); assert(STM.hasGFX90AInsts() || CurrentProgramInfo.ComputePGMRSrc3GFX90A == 0); @@ -922,22 +921,21 @@ // anything to disable it if we know the stack isn't used here. We may still // have emitted code reading it to initialize scratch, but if that's unused // reading garbage should be OK. - const bool EnablePrivateSegment = + ProgInfo.ScratchEnable = ProgInfo.ScratchBlocks > 0 || ProgInfo.DynamicCallStack; - ProgInfo.ComputePGMRSrc2 = - S_00B84C_SCRATCH_EN(EnablePrivateSegment) | - S_00B84C_USER_SGPR(MFI->getNumUserSGPRs()) | - // For AMDHSA, TRAP_HANDLER must be zero, as it is populated by the CP. - S_00B84C_TRAP_HANDLER(STM.isAmdHsaOS() ? 0 : STM.isTrapHandlerEnabled()) | - S_00B84C_TGID_X_EN(MFI->hasWorkGroupIDX()) | - S_00B84C_TGID_Y_EN(MFI->hasWorkGroupIDY()) | - S_00B84C_TGID_Z_EN(MFI->hasWorkGroupIDZ()) | - S_00B84C_TG_SIZE_EN(MFI->hasWorkGroupInfo()) | - S_00B84C_TIDIG_COMP_CNT(TIDIGCompCnt) | - S_00B84C_EXCP_EN_MSB(0) | - // For AMDHSA, LDS_SIZE must be zero, as it is populated by the CP. - S_00B84C_LDS_SIZE(STM.isAmdHsaOS() ? 
0 : ProgInfo.LDSBlocks) | - S_00B84C_EXCP_EN(0); + ProgInfo.UserSGPR = MFI->getNumUserSGPRs(); + // For AMDHSA, TRAP_HANDLER must be zero, as it is populated by the CP. + ProgInfo.TrapHandlerEnable = + STM.isAmdHsaOS() ? 0 : STM.isTrapHandlerEnabled(); + ProgInfo.TGIdXEnable = MFI->hasWorkGroupIDX(); + ProgInfo.TGIdYEnable = MFI->hasWorkGroupIDY(); + ProgInfo.TGIdZEnable = MFI->hasWorkGroupIDZ(); + ProgInfo.TGSizeEnable = MFI->hasWorkGroupInfo(); + ProgInfo.TIdIGCompCount = TIDIGCompCnt; + ProgInfo.EXCPEnMSB = 0; + // For AMDHSA, LDS_SIZE must be zero, as it is populated by the CP. + ProgInfo.LdsSize = STM.isAmdHsaOS() ? 0 : ProgInfo.LDSBlocks; + ProgInfo.EXCPEnable = 0; if (STM.hasGFX90AInsts()) { AMDHSA_BITS_SET(ProgInfo.ComputePGMRSrc3GFX90A, @@ -978,7 +976,7 @@ OutStreamer->emitInt32(CurrentProgramInfo.getComputePGMRSrc1()); OutStreamer->emitInt32(R_00B84C_COMPUTE_PGM_RSRC2); - OutStreamer->emitInt32(CurrentProgramInfo.ComputePGMRSrc2); + OutStreamer->emitInt32(CurrentProgramInfo.getComputePGMRSrc2()); OutStreamer->emitInt32(R_00B860_COMPUTE_TMPRING_SIZE); OutStreamer->emitInt32( @@ -1038,25 +1036,87 @@ } MD->setNumUsedSgprs(CC, CurrentProgramInfo.NumSGPRsForWavesPerEU); - MD->setRsrc1(CC, CurrentProgramInfo.getPGMRSrc1(CC)); - if (AMDGPU::isCompute(CC)) { - MD->setRsrc2(CC, CurrentProgramInfo.ComputePGMRSrc2); + if (MD->getPALMajorVersion() < 3) { + MD->setRsrc1(CC, CurrentProgramInfo.getPGMRSrc1(CC)); + if (AMDGPU::isCompute(CC)) { + MD->setRsrc2(CC, CurrentProgramInfo.getComputePGMRSrc2()); + } else { + if (CurrentProgramInfo.ScratchBlocks > 0) + MD->setRsrc2(CC, S_00B84C_SCRATCH_EN(1)); + } } else { - if (CurrentProgramInfo.ScratchBlocks > 0) - MD->setRsrc2(CC, S_00B84C_SCRATCH_EN(1)); + // Priority? + MD->setHwStage(CC, ".float_mode", CurrentProgramInfo.FloatMode); + // Priv? + // DX10Clamp? 
+ MD->setHwStage(CC, ".debug_mode", (bool)CurrentProgramInfo.DebugMode); + MD->setHwStage(CC, ".ieee_mode", (bool)CurrentProgramInfo.IEEEMode); + MD->setHwStage(CC, ".wgp_mode", (bool)CurrentProgramInfo.WgpMode); + MD->setHwStage(CC, ".mem_ordered", (bool)CurrentProgramInfo.MemOrdered); + + if (AMDGPU::isCompute(CC)) { + MD->setHwStage(CC, ".scratch_en", (bool)CurrentProgramInfo.ScratchEnable); + MD->setHwStage(CC, ".trap_present", + (bool)CurrentProgramInfo.TrapHandlerEnable); + + // Compute registers + // Assert that the front-end's tgid_x/y/z_en agree with CurrentProgramInfo + // (usually set with function attributes amdgpu-no-workgroup-id-x etc.). + // NOTE(review): checkComputeRegisters also fails if the field is absent. + assert(MD->checkComputeRegisters(".tgid_x_en", + (bool)CurrentProgramInfo.TGIdXEnable)); + assert(MD->checkComputeRegisters(".tgid_y_en", + (bool)CurrentProgramInfo.TGIdYEnable)); + assert(MD->checkComputeRegisters(".tgid_z_en", + (bool)CurrentProgramInfo.TGIdZEnable)); + + // EXCPEnMSB? + const unsigned LdsDwGranularity = 128; + MD->setHwStage(CC, ".lds_size", + (unsigned)(CurrentProgramInfo.LdsSize * LdsDwGranularity * + sizeof(uint32_t))); + MD->setHwStage(CC, ".excp_en", CurrentProgramInfo.EXCPEnable); + } else { + MD->setHwStage(CC, ".scratch_en", (bool)CurrentProgramInfo.ScratchEnable); + } } + // ScratchSize is in bytes, rounded up to a multiple of 16. MD->setScratchSize(CC, alignTo(CurrentProgramInfo.ScratchSize, 16)); if (MF.getFunction().getCallingConv() == CallingConv::AMDGPU_PS) { unsigned ExtraLDSSize = STM.getGeneration() >= AMDGPUSubtarget::GFX11 ? 
divideCeil(CurrentProgramInfo.LDSBlocks, 2) : CurrentProgramInfo.LDSBlocks; - MD->setRsrc2(CC, S_00B02C_EXTRA_LDS_SIZE(ExtraLDSSize)); - MD->setSpiPsInputEna(MFI->getPSInputEnable()); - MD->setSpiPsInputAddr(MFI->getPSInputAddr()); + if (MD->getPALMajorVersion() < 3) { + MD->setRsrc2(CC, S_00B02C_EXTRA_LDS_SIZE(ExtraLDSSize)); + MD->setSpiPsInputEna(MFI->getPSInputEnable()); + MD->setSpiPsInputAddr(MFI->getPSInputAddr()); + } else { + // Graphics registers + MD->setGraphicsRegisters(".ps_extra_lds_size", ExtraLDSSize); + // Emit each PSInputEna/PSInputAddr bit as its own named boolean field. + static StringLiteral const PsInputFields[] = { + ".persp_sample_ena", ".persp_center_ena", + ".persp_centroid_ena", ".persp_pull_model_ena", + ".linear_sample_ena", ".linear_center_ena", + ".linear_centroid_ena", ".line_stipple_tex_ena", + ".pos_x_float_ena", ".pos_y_float_ena", + ".pos_z_float_ena", ".pos_w_float_ena", + ".front_face_ena", ".ancillary_ena", + ".sample_coverage_ena", ".pos_fixed_pt_ena"}; + unsigned PSInputEna = MFI->getPSInputEnable(); + unsigned PSInputAddr = MFI->getPSInputAddr(); + for (auto [Idx, Field] : enumerate(PsInputFields)) { + MD->setGraphicsRegisters(".spi_ps_input_ena", Field, + (bool)((PSInputEna >> Idx) & 1)); + MD->setGraphicsRegisters(".spi_ps_input_addr", Field, + (bool)((PSInputAddr >> Idx) & 1)); + } + } } - if (STM.isWave32()) + // For version 3 and above the wavefront size is already set in the metadata + if (MD->getPALMajorVersion() < 3 && STM.isWave32()) MD->setWave32(MF.getFunction().getCallingConv()); } @@ -1068,7 +1128,7 @@ // Set compute registers MD->setRsrc1(CallingConv::AMDGPU_CS, CurrentProgramInfo.getPGMRSrc1(CallingConv::AMDGPU_CS)); - MD->setRsrc2(CallingConv::AMDGPU_CS, CurrentProgramInfo.ComputePGMRSrc2); + MD->setRsrc2(CallingConv::AMDGPU_CS, CurrentProgramInfo.getComputePGMRSrc2()); // Set optional info MD->setFunctionLdsSize(MF, CurrentProgramInfo.LDSSize); @@ -1104,7 +1164,7 @@ Out.compute_pgm_resource_registers 
= CurrentProgramInfo.getComputePGMRSrc1() | - (CurrentProgramInfo.ComputePGMRSrc2 << 32); + (CurrentProgramInfo.getComputePGMRSrc2() << 32); Out.code_properties |= AMD_CODE_PROPERTY_IS_PTR64; if (CurrentProgramInfo.DynamicCallStack) diff --git a/llvm/lib/Target/AMDGPU/SIProgramInfo.h b/llvm/lib/Target/AMDGPU/SIProgramInfo.h --- a/llvm/lib/Target/AMDGPU/SIProgramInfo.h +++ b/llvm/lib/Target/AMDGPU/SIProgramInfo.h @@ -36,11 +36,23 @@ uint32_t MemOrdered = 0; // GFX10+ uint64_t ScratchSize = 0; - // Fields set in PGM_RSRC2 pm4 packet. + // State used to calculate fields set in PGM_RSRC2 pm4 packet. uint32_t LDSBlocks = 0; uint32_t ScratchBlocks = 0; - uint64_t ComputePGMRSrc2 = 0; + // Fields set in PGM_RSRC2 pm4 packet + uint32_t ScratchEnable = 0; + uint32_t UserSGPR = 0; + uint32_t TrapHandlerEnable = 0; + uint32_t TGIdXEnable = 0; + uint32_t TGIdYEnable = 0; + uint32_t TGIdZEnable = 0; + uint32_t TGSizeEnable = 0; + uint32_t TIdIGCompCount = 0; + uint32_t EXCPEnMSB = 0; + uint32_t LdsSize = 0; + uint32_t EXCPEnable = 0; + uint64_t ComputePGMRSrc3GFX90A = 0; uint32_t NumVGPR = 0; @@ -75,6 +87,10 @@ /// Compute the value of the ComputePGMRsrc1 register. uint64_t getComputePGMRSrc1() const; uint64_t getPGMRSrc1(CallingConv::ID CC) const; + + /// Compute the value of the ComputePGMRsrc2 register. 
+ uint64_t getComputePGMRSrc2() const; + uint64_t getPGMRSrc2(CallingConv::ID CC) const; }; } // namespace llvm diff --git a/llvm/lib/Target/AMDGPU/SIProgramInfo.cpp b/llvm/lib/Target/AMDGPU/SIProgramInfo.cpp --- a/llvm/lib/Target/AMDGPU/SIProgramInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIProgramInfo.cpp @@ -54,3 +54,23 @@ } return Reg; } + +uint64_t SIProgramInfo::getComputePGMRSrc2() const { + uint64_t Reg = + S_00B84C_SCRATCH_EN(ScratchEnable) | S_00B84C_USER_SGPR(UserSGPR) | + S_00B84C_TRAP_HANDLER(TrapHandlerEnable) | + S_00B84C_TGID_X_EN(TGIdXEnable) | S_00B84C_TGID_Y_EN(TGIdYEnable) | + S_00B84C_TGID_Z_EN(TGIdZEnable) | S_00B84C_TG_SIZE_EN(TGSizeEnable) | + S_00B84C_TIDIG_COMP_CNT(TIdIGCompCount) | + S_00B84C_EXCP_EN_MSB(EXCPEnMSB) | S_00B84C_LDS_SIZE(LdsSize) | + S_00B84C_EXCP_EN(EXCPEnable); + + return Reg; +} + +uint64_t SIProgramInfo::getPGMRSrc2(CallingConv::ID CC) const { + if (AMDGPU::isCompute(CC)) + return getComputePGMRSrc2(); + + return 0; +} diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUPALMetadata.h b/llvm/lib/Target/AMDGPU/Utils/AMDGPUPALMetadata.h --- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUPALMetadata.h +++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUPALMetadata.h @@ -27,6 +27,11 @@ msgpack::DocNode Registers; msgpack::DocNode HwStages; msgpack::DocNode ShaderFunctions; + bool VersionChecked = false; + msgpack::DocNode Version; + // From PAL version >= 3.0 + msgpack::DocNode ComputeRegisters; + msgpack::DocNode GraphicsRegisters; public: // Read the amdgpu.pal.metadata supplied by the frontend, ready for @@ -129,6 +134,26 @@ // Set legacy PAL metadata format. 
void setLegacy(); + unsigned getPALMajorVersion(); + unsigned getPALMinorVersion(); + + void setHwStage(unsigned CC, StringRef field, unsigned Val); + void setHwStage(unsigned CC, StringRef field, bool Val); + + void setComputeRegisters(StringRef field, unsigned Val); + void setComputeRegisters(StringRef field, bool Val); + + // Returns nullptr if the field does not exist instead of creating it (as the + // other accessors do); the .compute_registers map itself is still created. + msgpack::DocNode *refComputeRegister(StringRef field); + bool checkComputeRegisters(StringRef field, unsigned Val); + bool checkComputeRegisters(StringRef field, bool Val); + + void setGraphicsRegisters(StringRef field, unsigned Val); + void setGraphicsRegisters(StringRef field, bool Val); + void setGraphicsRegisters(StringRef field1, StringRef field2, unsigned Val); + void setGraphicsRegisters(StringRef field1, StringRef field2, bool Val); + // Erase all PAL metadata. void reset(); @@ -151,10 +176,29 @@ // Get (create if necessary) a function in the shader functions map. msgpack::MapDocNode getShaderFunction(StringRef Name); + // Reference (create if necessary) the node for the .compute_registers map. + msgpack::DocNode &refComputeRegisters(); + + // Get (create if necessary) the .compute_registers entry. + msgpack::MapDocNode getComputeRegisters(); + + // Reference (create if necessary) the node for the .graphics_registers map. + msgpack::DocNode &refGraphicsRegisters(); + + // Get (create if necessary) the .graphics_registers entry. + msgpack::MapDocNode getGraphicsRegisters(); + + // Reference (create if necessary) the node for the .hardware_stages map. + msgpack::DocNode &refHwStage(); + // Get (create if necessary) the .hardware_stages entry for the given calling // convention. msgpack::MapDocNode getHwStage(unsigned CC); + // Get the PAL version major (idx 0) or minor (idx 1). 
This is an internal + // helper for the public wrapper functions that request Major or Minor + unsigned getPALVersion(unsigned idx); + bool setFromLegacyBlob(StringRef Blob); bool setFromMsgPackBlob(StringRef Blob); void toLegacyBlob(std::string &Blob); diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUPALMetadata.cpp b/llvm/lib/Target/AMDGPU/Utils/AMDGPUPALMetadata.cpp --- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUPALMetadata.cpp +++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUPALMetadata.cpp @@ -811,6 +811,38 @@ return Functions[Name].getMap(/*Convert=*/true); } +msgpack::DocNode &AMDGPUPALMetadata::refComputeRegisters() { + auto &N = + MsgPackDoc.getRoot() + .getMap(/*Convert=*/true)[MsgPackDoc.getNode("amdpal.pipelines")] + .getArray(/*Convert=*/true)[0] + .getMap(/*Convert=*/true)[MsgPackDoc.getNode(".compute_registers")]; + N.getMap(/*Convert=*/true); + return N; +} + +msgpack::MapDocNode AMDGPUPALMetadata::getComputeRegisters() { + if (ComputeRegisters.isEmpty()) + ComputeRegisters = refComputeRegisters(); + return ComputeRegisters.getMap(); +} + +msgpack::DocNode &AMDGPUPALMetadata::refGraphicsRegisters() { + auto &N = + MsgPackDoc.getRoot() + .getMap(/*Convert=*/true)[MsgPackDoc.getNode("amdpal.pipelines")] + .getArray(/*Convert=*/true)[0] + .getMap(/*Convert=*/true)[MsgPackDoc.getNode(".graphics_registers")]; + N.getMap(/*Convert=*/true); + return N; +} + +msgpack::MapDocNode AMDGPUPALMetadata::getGraphicsRegisters() { + if (GraphicsRegisters.isEmpty()) + GraphicsRegisters = refGraphicsRegisters(); + return GraphicsRegisters.getMap(); +} + // Return the PAL metadata hardware shader stage name. 
static const char *getStageName(CallingConv::ID CC) { switch (CC) { @@ -833,15 +865,21 @@ } } +msgpack::DocNode &AMDGPUPALMetadata::refHwStage() { + auto &N = + MsgPackDoc.getRoot() + .getMap(/*Convert=*/true)[MsgPackDoc.getNode("amdpal.pipelines")] + .getArray(/*Convert=*/true)[0] + .getMap(/*Convert=*/true)[MsgPackDoc.getNode(".hardware_stages")]; + N.getMap(/*Convert=*/true); + return N; +} + // Get (create if necessary) the .hardware_stages entry for the given calling // convention. msgpack::MapDocNode AMDGPUPALMetadata::getHwStage(unsigned CC) { if (HwStages.isEmpty()) - HwStages = MsgPackDoc.getRoot() - .getMap(/*Convert=*/true)["amdpal.pipelines"] - .getArray(/*Convert=*/true)[0] - .getMap(/*Convert=*/true)[".hardware_stages"] - .getMap(/*Convert=*/true); + HwStages = refHwStage(); return HwStages.getMap()[getStageName(CC)].getMap(/*Convert=*/true); } @@ -874,3 +912,78 @@ Registers = MsgPackDoc.getEmptyNode(); HwStages = MsgPackDoc.getEmptyNode(); } + +unsigned AMDGPUPALMetadata::getPALVersion(unsigned idx) { + assert(idx < 2 && + "illegal index to PAL version - should be 0 (major) or 1 (minor)"); + if (!VersionChecked) { + if (Version.isEmpty()) { + auto &M = MsgPackDoc.getRoot().getMap(/*Convert=*/true); + auto I = M.find(MsgPackDoc.getNode("amdpal.version")); + if (I != M.end()) + Version = I->second; + } + VersionChecked = true; + } + if (Version.isEmpty()) + // Default to 2.6 if there's no version info + return idx ? 
6 : 2; + return Version.getArray()[idx].getUInt(); +} + +unsigned AMDGPUPALMetadata::getPALMajorVersion() { return getPALVersion(0); } + +unsigned AMDGPUPALMetadata::getPALMinorVersion() { return getPALVersion(1); } + +// Set the field in a given .hardware_stages entry +void AMDGPUPALMetadata::setHwStage(unsigned CC, StringRef field, unsigned Val) { + getHwStage(CC)[field] = Val; +} + +void AMDGPUPALMetadata::setHwStage(unsigned CC, StringRef field, bool Val) { + getHwStage(CC)[field] = Val; +} + +void AMDGPUPALMetadata::setComputeRegisters(StringRef field, unsigned Val) { + getComputeRegisters()[field] = Val; +} + +void AMDGPUPALMetadata::setComputeRegisters(StringRef field, bool Val) { + getComputeRegisters()[field] = Val; +} + +msgpack::DocNode *AMDGPUPALMetadata::refComputeRegister(StringRef field) { + auto M = getComputeRegisters(); + auto I = M.find(field); + return I == M.end() ? nullptr : &I->second; +} + +bool AMDGPUPALMetadata::checkComputeRegisters(StringRef field, unsigned Val) { + if (auto N = refComputeRegister(field)) + return N->getUInt() == Val; + return false; +} + +bool AMDGPUPALMetadata::checkComputeRegisters(StringRef field, bool Val) { + if (auto N = refComputeRegister(field)) + return N->getBool() == Val; + return false; +} + +void AMDGPUPALMetadata::setGraphicsRegisters(StringRef field, unsigned Val) { + getGraphicsRegisters()[field] = Val; +} + +void AMDGPUPALMetadata::setGraphicsRegisters(StringRef field, bool Val) { + getGraphicsRegisters()[field] = Val; +} + +void AMDGPUPALMetadata::setGraphicsRegisters(StringRef field1, StringRef field2, + unsigned Val) { + getGraphicsRegisters()[field1].getMap(true)[field2] = Val; +} + +void AMDGPUPALMetadata::setGraphicsRegisters(StringRef field1, StringRef field2, + bool Val) { + getGraphicsRegisters()[field1].getMap(true)[field2] = Val; +} diff --git a/llvm/test/CodeGen/AMDGPU/pal-metadata-3.0.ll b/llvm/test/CodeGen/AMDGPU/pal-metadata-3.0.ll new file mode 100644 --- /dev/null +++ 
b/llvm/test/CodeGen/AMDGPU/pal-metadata-3.0.ll @@ -0,0 +1,165 @@ +; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx1100 <%s | FileCheck %s + +; CHECK-LABEL: {{^}}_amdgpu_cs_main: +; CHECK: ; NumSgprs: 4 +; CHECK: ; NumVgprs: 2 +; CHECK: .amdgpu_pal_metadata +; CHECK-NEXT: --- +; CHECK-NEXT: amdpal.pipelines: +; CHECK-NEXT: - .api: Vulkan +; CHECK-NEXT: .compute_registers: +; CHECK-NEXT: .tg_size_en: true +; CHECK-NEXT: .tgid_x_en: false +; CHECK-NEXT: .tgid_y_en: false +; CHECK-NEXT: .tgid_z_en: false +; CHECK-NEXT: .tidig_comp_cnt: 0x1 +; CHECK-NEXT: .graphics_registers: +; CHECK-NEXT: .ps_extra_lds_size: 0 +; CHECK-NEXT: .spi_ps_input_addr: +; CHECK-NEXT: .ancillary_ena: false +; CHECK-NEXT: .front_face_ena: true +; CHECK-NEXT: .line_stipple_tex_ena: false +; CHECK-NEXT: .linear_center_ena: true +; CHECK-NEXT: .linear_centroid_ena: true +; CHECK-NEXT: .linear_sample_ena: true +; CHECK-NEXT: .persp_center_ena: true +; CHECK-NEXT: .persp_centroid_ena: true +; CHECK-NEXT: .persp_pull_model_ena: false +; CHECK-NEXT: .persp_sample_ena: true +; CHECK-NEXT: .pos_fixed_pt_ena: true +; CHECK-NEXT: .pos_w_float_ena: false +; CHECK-NEXT: .pos_x_float_ena: false +; CHECK-NEXT: .pos_y_float_ena: false +; CHECK-NEXT: .pos_z_float_ena: false +; CHECK-NEXT: .sample_coverage_ena: false +; CHECK-NEXT: .spi_ps_input_ena: +; CHECK-NEXT: .ancillary_ena: false +; CHECK-NEXT: .front_face_ena: false +; CHECK-NEXT: .line_stipple_tex_ena: false +; CHECK-NEXT: .linear_center_ena: false +; CHECK-NEXT: .linear_centroid_ena: false +; CHECK-NEXT: .linear_sample_ena: false +; CHECK-NEXT: .persp_center_ena: false +; CHECK-NEXT: .persp_centroid_ena: false +; CHECK-NEXT: .persp_pull_model_ena: false +; CHECK-NEXT: .persp_sample_ena: true +; CHECK-NEXT: .pos_fixed_pt_ena: false +; CHECK-NEXT: .pos_w_float_ena: false +; CHECK-NEXT: .pos_x_float_ena: false +; CHECK-NEXT: .pos_y_float_ena: false +; CHECK-NEXT: .pos_z_float_ena: false +; CHECK-NEXT: .sample_coverage_ena: false +; CHECK-NEXT: 
.hardware_stages: +; CHECK-NEXT: .cs: +; CHECK-NEXT: .checksum_value: 0x9444d7d0 +; CHECK-NEXT: .debug_mode: false +; CHECK-NEXT: .entry_point: _amdgpu_cs_main +; CHECK-NEXT: .excp_en: 0 +; CHECK-NEXT: .float_mode: 0xc0 +; CHECK-NEXT: .ieee_mode: false +; CHECK-NEXT: .image_op: false +; CHECK-NEXT: .lds_size: 0 +; CHECK-NEXT: .mem_ordered: true +; CHECK-NEXT: .scratch_en: false +; CHECK-NEXT: .scratch_memory_size: 0 +; CHECK-NEXT: .sgpr_count: 0x4 +; CHECK-NEXT: .sgpr_limit: 0x6a +; CHECK-NEXT: .threadgroup_dimensions: +; CHECK-NEXT: - 0x1 +; CHECK-NEXT: - 0x400 +; CHECK-NEXT: - 0x1 +; CHECK-NEXT: .trap_present: false +; CHECK-NEXT: .user_data_reg_map: +; CHECK-NEXT: - 0x10000000 +; CHECK-NEXT: - 0xffffffff +; CHECK-NEXT: - 0 +; CHECK-NEXT: - 0xffffffff +; CHECK-NEXT: - 0xffffffff +; CHECK-NEXT: - 0xffffffff +; CHECK-NEXT: - 0xffffffff +; CHECK-NEXT: - 0xffffffff +; CHECK-NEXT: - 0xffffffff +; CHECK-NEXT: - 0xffffffff +; CHECK-NEXT: - 0xffffffff +; CHECK-NEXT: - 0xffffffff +; CHECK-NEXT: - 0xffffffff +; CHECK-NEXT: - 0xffffffff +; CHECK-NEXT: - 0xffffffff +; CHECK-NEXT: - 0xffffffff +; CHECK-NEXT: - 0xffffffff +; CHECK-NEXT: - 0xffffffff +; CHECK-NEXT: - 0xffffffff +; CHECK-NEXT: - 0xffffffff +; CHECK-NEXT: - 0xffffffff +; CHECK-NEXT: - 0xffffffff +; CHECK-NEXT: - 0xffffffff +; CHECK-NEXT: - 0xffffffff +; CHECK-NEXT: - 0xffffffff +; CHECK-NEXT: - 0xffffffff +; CHECK-NEXT: - 0xffffffff +; CHECK-NEXT: - 0xffffffff +; CHECK-NEXT: - 0xffffffff +; CHECK-NEXT: - 0xffffffff +; CHECK-NEXT: - 0xffffffff +; CHECK-NEXT: - 0xffffffff +; CHECK-NEXT: .user_sgprs: 0x3 +; CHECK-NEXT: .vgpr_count: 0x2 +; CHECK-NEXT: .vgpr_limit: 0x100 +; CHECK-NEXT: .wavefront_size: 0x40 +; CHECK-NEXT: .wgp_mode: false +; CHECK-NEXT: .ps: +; CHECK-NEXT: .debug_mode: false +; CHECK-NEXT: .entry_point: ps_shader +; CHECK-NEXT: .float_mode: 0xf0 +; CHECK-NEXT: .ieee_mode: false +; CHECK-NEXT: .mem_ordered: true +; CHECK-NEXT: .scratch_en: false +; CHECK-NEXT: .scratch_memory_size: 0 +; CHECK-NEXT: 
.sgpr_count: 0x1 +; CHECK-NEXT: .vgpr_count: 0x1 +; CHECK-NEXT: .wgp_mode: true +; CHECK: .registers: {} +; CHECK:amdpal.version: +; CHECK-NEXT: - 0x3 +; CHECK-NEXT: - 0 +; CHECK-NEXT:... +; CHECK-NEXT: .end_amdgpu_pal_metadata + +define dllexport amdgpu_cs void @_amdgpu_cs_main(i32 inreg %arg1, i32 %arg2) #0 !lgc.shaderstage !1 { +.entry: + %i = call i64 @llvm.amdgcn.s.getpc() + %i1 = and i64 %i, -4294967296 + %i2 = zext i32 %arg1 to i64 + %i3 = or i64 %i1, %i2 + %i4 = inttoptr i64 %i3 to ptr addrspace(4) + %i5 = and i32 %arg2, 1023 + %i6 = lshr i32 %arg2, 10 + %i7 = and i32 %i6, 1023 + %i8 = add nuw nsw i32 %i7, %i5 + %i9 = load <4 x i32>, ptr addrspace(4) %i4, align 16 + %.idx = shl nuw nsw i32 %i8, 2 + call void @llvm.amdgcn.raw.buffer.store.i32(i32 1, <4 x i32> %i9, i32 %.idx, i32 0, i32 0) + ret void +} + +define dllexport amdgpu_ps void @ps_shader() #1 { + ret void +} + +!amdgpu.pal.metadata.msgpack = !{!0} + +; Function Attrs: nounwind willreturn memory(none) +declare ptr addrspace(7) @lgc.buffer.desc.to.ptr(<4 x i32>) #1 + +; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare i64 @llvm.amdgcn.s.getpc() #2 + +; Function Attrs: nocallback nofree nosync nounwind willreturn memory(write) +declare void @llvm.amdgcn.raw.buffer.store.i32(i32, <4 x i32>, i32, i32, i32 immarg) #3 + +attributes #0 = { nounwind memory(readwrite) "amdgpu-flat-work-group-size"="1024,1024" "amdgpu-memory-bound"="false" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-unroll-threshold"="700" "amdgpu-wave-limiter"="false" "amdgpu-work-group-info-arg-no"="4" "denormal-fp-math-f32"="preserve-sign" "target-features"=",+wavefrontsize64,+cumode" } + +attributes #1 = { nounwind memory(readwrite) "InitialPSInputAddr"="36983" } + +!0 = 
!{!"\82\B0amdpal.pipelines\91\8A\A4.api\A6Vulkan\B2.compute_registers\85\AB.tg_size_en\C3\AA.tgid_x_en\C2\AA.tgid_y_en\C2\AA.tgid_z_en\C2\AF.tidig_comp_cnt\01\B0.hardware_stages\81\A3.cs\8C\AF.checksum_value\CE\94D\D7\D0\AB.debug_mode\00\AB.float_mode\CC\C0\A9.image_op\C2\AC.mem_ordered\C3\AB.sgpr_limitj\B7.threadgroup_dimensions\93\01\CD\04\00\01\AD.trap_present\00\B2.user_data_reg_map\DC\00 \CE\10\00\00\00\CE\FF\FF\FF\FF\00\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\AB.user_sgprs\03\AB.vgpr_limit\CD\01\00\AF.wavefront_size@\B7.internal_pipeline_hash\92\CF\E7\10k\A6:\A6%\F7\CF\B2\1F\1A\D4{\DA\E1T\AA.registers\80\A8.shaders\81\A8.compute\82\B0.api_shader_hash\92\CF\E9Zn7}\1E\B9\E7\00\B1.hardware_mapping\91\A3.cs\B0.spill_threshold\CE\FF\FF\FF\FF\A5.type\A2Cs\B0.user_data_limit\01\AF.xgl_cache_info\82\B3.128_bit_cache_hash\92\CF\B4X\B8\11[\A4\88P\CF\A0;\B0\AF\FF\B4\BE\C0\AD.llpc_version\A461.1\AEamdpal.version\92\03\00"} +!1 = !{i32 7}