Index: lib/Target/AMDGPU/AMDGPUAsmPrinter.h =================================================================== --- lib/Target/AMDGPU/AMDGPUAsmPrinter.h +++ lib/Target/AMDGPU/AMDGPUAsmPrinter.h @@ -15,6 +15,7 @@ #ifndef LLVM_LIB_TARGET_AMDGPU_AMDGPUASMPRINTER_H #define LLVM_LIB_TARGET_AMDGPU_AMDGPUASMPRINTER_H +#include "AMDKernelCodeT.h" #include "llvm/ADT/StringRef.h" #include "llvm/CodeGen/AsmPrinter.h" #include <vector> @@ -89,6 +90,8 @@ }; void getSIProgramInfo(SIProgramInfo &Out, const MachineFunction &MF) const; + void getAmdKernelCode(amd_kernel_code_t &Out, const SIProgramInfo &KernelInfo, + const MachineFunction &MF) const; void findNumUsedRegistersSI(const MachineFunction &MF, unsigned &NumSGPR, unsigned &NumVGPR) const; @@ -97,8 +100,6 @@ /// can correctly setup the GPU state. void EmitProgramInfoR600(const MachineFunction &MF); void EmitProgramInfoSI(const MachineFunction &MF, const SIProgramInfo &KernelInfo); - void EmitAmdKernelCodeT(const MachineFunction &MF, - const SIProgramInfo &KernelInfo) const; public: explicit AMDGPUAsmPrinter(TargetMachine &TM, Index: lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp =================================================================== --- lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp +++ lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp @@ -21,7 +21,6 @@ #include "InstPrinter/AMDGPUInstPrinter.h" #include "Utils/AMDGPUBaseInfo.h" #include "AMDGPU.h" -#include "AMDKernelCodeT.h" #include "AMDGPUSubtarget.h" #include "R600Defines.h" #include "R600MachineFunctionInfo.h" @@ -141,14 +140,18 @@ void AMDGPUAsmPrinter::EmitFunctionBodyStart() { const AMDGPUSubtarget &STM = MF->getSubtarget<AMDGPUSubtarget>(); SIProgramInfo KernelInfo; + amd_kernel_code_t KernelCode; if (STM.isAmdCodeObjectV2(*MF)) { getSIProgramInfo(KernelInfo, *MF); - EmitAmdKernelCodeT(*MF, KernelInfo); + getAmdKernelCode(KernelCode, KernelInfo, *MF); + + OutStreamer->SwitchSection(getObjFileLowering().getTextSection()); + getTargetStreamer().EmitAMDKernelCodeT(KernelCode); } if
(TM.getTargetTriple().getOS() != Triple::AMDHSA) return; - getTargetStreamer().EmitKernelRuntimeMetadata(*MF->getFunction()); + getTargetStreamer().EmitKernelRuntimeMetadata(*MF->getFunction(), KernelCode); } void AMDGPUAsmPrinter::EmitFunctionEntryLabel() { @@ -724,94 +727,91 @@ } } -void AMDGPUAsmPrinter::EmitAmdKernelCodeT(const MachineFunction &MF, - const SIProgramInfo &KernelInfo) const { +void AMDGPUAsmPrinter::getAmdKernelCode(amd_kernel_code_t &Out, + const SIProgramInfo &KernelInfo, + const MachineFunction &MF) const { const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); const SISubtarget &STM = MF.getSubtarget<SISubtarget>(); - amd_kernel_code_t header; - AMDGPU::initDefaultAMDKernelCodeT(header, STM.getFeatureBits()); + AMDGPU::initDefaultAMDKernelCodeT(Out, STM.getFeatureBits()); - header.compute_pgm_resource_registers = + Out.compute_pgm_resource_registers = KernelInfo.ComputePGMRSrc1 | (KernelInfo.ComputePGMRSrc2 << 32); - header.code_properties = AMD_CODE_PROPERTY_IS_PTR64; + Out.code_properties = AMD_CODE_PROPERTY_IS_PTR64; - AMD_HSA_BITS_SET(header.code_properties, + AMD_HSA_BITS_SET(Out.code_properties, AMD_CODE_PROPERTY_PRIVATE_ELEMENT_SIZE, getElementByteSizeValue(STM.getMaxPrivateElementSize())); if (MFI->hasPrivateSegmentBuffer()) { - header.code_properties |= + Out.code_properties |= AMD_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_BUFFER; } if (MFI->hasDispatchPtr()) - header.code_properties |= AMD_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_PTR; + Out.code_properties |= AMD_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_PTR; if (MFI->hasQueuePtr()) - header.code_properties |= AMD_CODE_PROPERTY_ENABLE_SGPR_QUEUE_PTR; + Out.code_properties |= AMD_CODE_PROPERTY_ENABLE_SGPR_QUEUE_PTR; if (MFI->hasKernargSegmentPtr()) - header.code_properties |= AMD_CODE_PROPERTY_ENABLE_SGPR_KERNARG_SEGMENT_PTR; + Out.code_properties |= AMD_CODE_PROPERTY_ENABLE_SGPR_KERNARG_SEGMENT_PTR; if (MFI->hasDispatchID()) - header.code_properties |= AMD_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_ID; +
Out.code_properties |= AMD_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_ID; if (MFI->hasFlatScratchInit()) - header.code_properties |= AMD_CODE_PROPERTY_ENABLE_SGPR_FLAT_SCRATCH_INIT; + Out.code_properties |= AMD_CODE_PROPERTY_ENABLE_SGPR_FLAT_SCRATCH_INIT; // TODO: Private segment size if (MFI->hasGridWorkgroupCountX()) { - header.code_properties |= + Out.code_properties |= AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_X; } if (MFI->hasGridWorkgroupCountY()) { - header.code_properties |= + Out.code_properties |= AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_Y; } if (MFI->hasGridWorkgroupCountZ()) { - header.code_properties |= + Out.code_properties |= AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_Z; } if (MFI->hasDispatchPtr()) - header.code_properties |= AMD_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_PTR; + Out.code_properties |= AMD_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_PTR; if (STM.debuggerSupported()) - header.code_properties |= AMD_CODE_PROPERTY_IS_DEBUG_SUPPORTED; + Out.code_properties |= AMD_CODE_PROPERTY_IS_DEBUG_SUPPORTED; if (STM.isXNACKEnabled()) - header.code_properties |= AMD_CODE_PROPERTY_IS_XNACK_SUPPORTED; + Out.code_properties |= AMD_CODE_PROPERTY_IS_XNACK_SUPPORTED; // FIXME: Should use getKernArgSize - header.kernarg_segment_byte_size = + Out.kernarg_segment_byte_size = STM.getKernArgSegmentSize(MF, MFI->getABIArgOffset()); - header.wavefront_sgpr_count = KernelInfo.NumSGPR; - header.workitem_vgpr_count = KernelInfo.NumVGPR; - header.workitem_private_segment_byte_size = KernelInfo.ScratchSize; - header.workgroup_group_segment_byte_size = KernelInfo.LDSSize; - header.reserved_vgpr_first = KernelInfo.ReservedVGPRFirst; - header.reserved_vgpr_count = KernelInfo.ReservedVGPRCount; + Out.wavefront_sgpr_count = KernelInfo.NumSGPR; + Out.workitem_vgpr_count = KernelInfo.NumVGPR; + Out.workitem_private_segment_byte_size = KernelInfo.ScratchSize; + Out.workgroup_group_segment_byte_size = KernelInfo.LDSSize; + Out.reserved_vgpr_first = 
KernelInfo.ReservedVGPRFirst; + Out.reserved_vgpr_count = KernelInfo.ReservedVGPRCount; // These alignment values are specified in powers of two, so alignment = // 2^n. The minimum alignment is 2^4 = 16. - header.kernarg_segment_alignment = std::max((size_t)4, + Out.kernarg_segment_alignment = std::max((size_t)4, countTrailingZeros(MFI->getMaxKernArgAlign())); if (STM.debuggerEmitPrologue()) { - header.debug_wavefront_private_segment_offset_sgpr = + Out.debug_wavefront_private_segment_offset_sgpr = KernelInfo.DebuggerWavefrontPrivateSegmentOffsetSGPR; - header.debug_private_segment_buffer_sgpr = + Out.debug_private_segment_buffer_sgpr = KernelInfo.DebuggerPrivateSegmentBufferSGPR; } - - OutStreamer->SwitchSection(getObjFileLowering().getTextSection()); - getTargetStreamer().EmitAMDKernelCodeT(header); } bool AMDGPUAsmPrinter::PrintAsmOperand(const MachineInstr *MI, unsigned OpNo, Index: lib/Target/AMDGPU/AMDGPURuntimeMetadata.h =================================================================== --- lib/Target/AMDGPU/AMDGPURuntimeMetadata.h +++ lib/Target/AMDGPU/AMDGPURuntimeMetadata.h @@ -127,6 +127,43 @@ // Alignment of pointee type const char ArgPointeeAlign[] = "amd.ArgPointeeAlign"; + const char KernelStatsWorkitemPrivateSegmentByteSize[] = + "amd.Kernel.Stats.WorkitemPrivateSegmentByteSize"; + const char KernelStatsWorkgroupGroupSegmentByteSize[] = + "amd.Kernel.Stats.WorkgroupGroupSegmentByteSize"; + const char KernelStatsGdsSegmentByteSize[] = + "amd.Kernel.Stats.GdsSegmentByteSize"; + const char KernelStatsKernargSegmentByteSize[] = + "amd.Kernel.Stats.KernargSegmentByteSize"; + const char KernelStatsWorkgroupNumFbarriers[] = + "amd.Kernel.Stats.WorkgroupNumFbarriers"; + const char KernelStatsWavefrontNumSGPRs[] = + "amd.Kernel.Stats.WavefrontNumSGPRs"; + const char KernelStatsWorkitemNumVGPRs[] = + "amd.Kernel.Stats.WorkitemNumVGPRs"; + const char KernelStatsReservedFirstVGPR[] = + "amd.Kernel.Stats.ReservedFirstVGPR"; + const char 
KernelStatsReservedNumVGPRs[] = + "amd.Kernel.Stats.ReservedNumVGPRs"; + const char KernelStatsReservedFirstSGPR[] = + "amd.Kernel.Stats.ReservedFirstSGPR"; + const char KernelStatsReservedNumSGPRs[] = + "amd.Kernel.Stats.ReservedNumSGPRs"; + const char KernelStatsDebugWavefrontPrivateSegmentOffsetSGPR[] = + "amd.Kernel.Stats.DebugWavefrontPrivateSegmentOffsetSGPR"; + const char KernelStatsDebugPrivateSegmentBufferSGPR[] = + "amd.Kernel.Stats.DebugPrivateSegmentBufferSGPR"; + const char KernelStatsKernargSegmentAlignment[] = + "amd.Kernel.Stats.KernargSegmentAlignment"; + const char KernelStatsGroupSegmentAlignment[] = + "amd.Kernel.Stats.GroupSegmentAlignment"; + const char KernelStatsPrivateSegmentAlignment[] = + "amd.Kernel.Stats.PrivateSegmentAlignment"; + const char KernelStatsWavefrontSize[] = + "amd.Kernel.Stats.WavefrontSize"; + const char KernelStatsCallConvention[] = + "amd.Kernel.Stats.CallConvention"; + } // end namespace KeyName namespace KernelArg { @@ -223,6 +260,24 @@ uint32_t KernelIndex = INVALID_KERNEL_INDEX; uint8_t NoPartialWorkGroups = 0; std::vector<KernelArg::Metadata> Args; + uint32_t WorkitemPrivateSegmentByteSize = 0; + uint32_t WorkgroupGroupSegmentByteSize = 0; + uint32_t GdsSegmentByteSize = 0; + uint64_t KernargSegmentByteSize = 0; + uint32_t WorkgroupNumFbarriers = 0; + uint16_t WavefrontNumSGPRs = 0; + uint16_t WorkitemNumVGPRs = 0; + uint16_t ReservedFirstVGPR = 0; + uint16_t ReservedNumVGPRs = 0; + uint16_t ReservedFirstSGPR = 0; + uint16_t ReservedNumSGPRs = 0; + uint16_t DebugWavefrontPrivateSegmentOffsetSGPR = 0; + uint16_t DebugPrivateSegmentBufferSGPR = 0; + uint8_t KernargSegmentAlignment = 0; + uint8_t GroupSegmentAlignment = 0; + uint8_t PrivateSegmentAlignment = 0; + uint8_t WavefrontSize = 0; + int32_t CallConvention = 0; Metadata() = default; }; Index: lib/Target/AMDGPU/MCTargetDesc/AMDGPURuntimeMetadataStreamer.h =================================================================== ---
lib/Target/AMDGPU/MCTargetDesc/AMDGPURuntimeMetadataStreamer.h +++ lib/Target/AMDGPU/MCTargetDesc/AMDGPURuntimeMetadataStreamer.h @@ -11,6 +11,7 @@ #define LLVM_LIB_TARGET_AMDGPU_MCTARGETDESC_AMDGPURUNTIMEMETADATASTREAMER_H #include "AMDGPURuntimeMetadata.h" +#include "AMDKernelCodeT.h" #include "llvm/ADT/StringRef.h" #include #include @@ -57,6 +58,8 @@ void streamHighLevelKernelMetadata(const Function &Func); + void streamKernelStatisticsMetadata(const amd_kernel_code_t &KernelCode); + void streamKernelArgMetadata(const Argument &Arg); void streamKernelArgMetadata(const DataLayout &DL, Type *Ty, @@ -73,7 +76,8 @@ void streamEnd() {} - void streamKernelMetadata(const Function &Func); + void streamKernelMetadata(const Function &Func, + const amd_kernel_code_t &KernelCode); std::string toYamlString(); }; Index: lib/Target/AMDGPU/MCTargetDesc/AMDGPURuntimeMetadataStreamer.cpp =================================================================== --- lib/Target/AMDGPU/MCTargetDesc/AMDGPURuntimeMetadataStreamer.cpp +++ lib/Target/AMDGPU/MCTargetDesc/AMDGPURuntimeMetadataStreamer.cpp @@ -83,6 +83,61 @@ YamlIO.mapOptional( KeyName::NoPartialWorkGroups, K.NoPartialWorkGroups, uint8_t(0)); YamlIO.mapRequired(KeyName::Args, K.Args); + + YamlIO.mapRequired( + KeyName::KernelStatsWorkitemPrivateSegmentByteSize, + K.WorkitemPrivateSegmentByteSize); + YamlIO.mapRequired( + KeyName::KernelStatsWorkgroupGroupSegmentByteSize, + K.WorkgroupGroupSegmentByteSize); + YamlIO.mapRequired( + KeyName::KernelStatsGdsSegmentByteSize, + K.GdsSegmentByteSize); + YamlIO.mapRequired( + KeyName::KernelStatsKernargSegmentByteSize, + K.KernargSegmentByteSize); + YamlIO.mapRequired( + KeyName::KernelStatsWorkgroupNumFbarriers, + K.WorkgroupNumFbarriers); + YamlIO.mapRequired( + KeyName::KernelStatsWavefrontNumSGPRs, + K.WavefrontNumSGPRs); + YamlIO.mapRequired( + KeyName::KernelStatsWorkitemNumVGPRs, + K.WorkitemNumVGPRs); + YamlIO.mapRequired( + KeyName::KernelStatsReservedFirstVGPR, + 
K.ReservedFirstVGPR); + YamlIO.mapRequired( + KeyName::KernelStatsReservedNumVGPRs, + K.ReservedNumVGPRs); + YamlIO.mapRequired( + KeyName::KernelStatsReservedFirstSGPR, + K.ReservedFirstSGPR); + YamlIO.mapRequired( + KeyName::KernelStatsReservedNumSGPRs, + K.ReservedNumSGPRs); + YamlIO.mapRequired( + KeyName::KernelStatsDebugWavefrontPrivateSegmentOffsetSGPR, + K.DebugWavefrontPrivateSegmentOffsetSGPR); + YamlIO.mapRequired( + KeyName::KernelStatsDebugPrivateSegmentBufferSGPR, + K.DebugPrivateSegmentBufferSGPR); + YamlIO.mapRequired( + KeyName::KernelStatsKernargSegmentAlignment, + K.KernargSegmentAlignment); + YamlIO.mapRequired( + KeyName::KernelStatsGroupSegmentAlignment, + K.GroupSegmentAlignment); + YamlIO.mapRequired( + KeyName::KernelStatsPrivateSegmentAlignment, + K.PrivateSegmentAlignment); + YamlIO.mapRequired( + KeyName::KernelStatsWavefrontSize, + K.WavefrontSize); + YamlIO.mapRequired( + KeyName::KernelStatsCallConvention, + K.CallConvention); } static const bool flow = true; }; @@ -332,6 +387,36 @@ } } +void Streamer::streamKernelStatisticsMetadata( + const amd_kernel_code_t &KernelCode) { + auto &K = Program.Kernels.back(); + + K.WorkitemPrivateSegmentByteSize = + KernelCode.workitem_private_segment_byte_size; + K.WorkgroupGroupSegmentByteSize = + KernelCode.workgroup_group_segment_byte_size; + K.GdsSegmentByteSize = + KernelCode.gds_segment_byte_size; + K.KernargSegmentByteSize = + KernelCode.kernarg_segment_byte_size; + K.WorkgroupNumFbarriers = KernelCode.workgroup_fbarrier_count; + K.WavefrontNumSGPRs = KernelCode.wavefront_sgpr_count; + K.WorkitemNumVGPRs = KernelCode.workitem_vgpr_count; + K.ReservedFirstVGPR = KernelCode.reserved_vgpr_first; + K.ReservedNumVGPRs = KernelCode.reserved_vgpr_count; + K.ReservedFirstSGPR = KernelCode.reserved_sgpr_first; + K.ReservedNumSGPRs = KernelCode.reserved_sgpr_count; + K.DebugWavefrontPrivateSegmentOffsetSGPR = + KernelCode.debug_wavefront_private_segment_offset_sgpr; + K.DebugPrivateSegmentBufferSGPR = + 
KernelCode.debug_private_segment_buffer_sgpr; + K.KernargSegmentAlignment = KernelCode.kernarg_segment_alignment; + K.GroupSegmentAlignment = KernelCode.group_segment_alignment; + K.PrivateSegmentAlignment = KernelCode.private_segment_alignment; + K.WavefrontSize = KernelCode.wavefront_size; + K.CallConvention = KernelCode.call_convention; +} + void Streamer::streamKernelArgMetadata(const Argument &Arg) { auto &Func = *Arg.getParent(); unsigned ArgNo = Arg.getArgNo(); @@ -444,7 +529,8 @@ streamPrintfInfoMetadata(Mod); } -void Streamer::streamKernelMetadata(const Function &Func) { +void Streamer::streamKernelMetadata(const Function &Func, + const amd_kernel_code_t &KernelCode) { if (Func.getCallingConv() != CallingConv::AMDGPU_KERNEL) return; Index: lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.h =================================================================== --- lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.h +++ lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.h @@ -52,7 +52,8 @@ virtual void EmitStartOfRuntimeMetadata(const FeatureBitset &Features, const Module &Mod); - virtual void EmitKernelRuntimeMetadata(const Function &Func); + virtual void EmitKernelRuntimeMetadata(const Function &Func, + const amd_kernel_code_t &KernelCode); virtual void EmitEmitEndOfRuntimeMetadata(); Index: lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp =================================================================== --- lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp +++ lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp @@ -44,8 +44,10 @@ RuntimeMetadataStreamer.streamBegin(Features, Mod); } -void AMDGPUTargetStreamer::EmitKernelRuntimeMetadata(const Function &Func) { - RuntimeMetadataStreamer.streamKernelMetadata(Func); +void AMDGPUTargetStreamer::EmitKernelRuntimeMetadata( + const Function &Func, + const amd_kernel_code_t &KernelCode) { + RuntimeMetadataStreamer.streamKernelMetadata(Func, KernelCode); } void 
AMDGPUTargetStreamer::EmitEmitEndOfRuntimeMetadata() {