diff --git a/D80713.diff b/D80713.diff new file mode 100644 --- /dev/null +++ b/D80713.diff @@ -0,0 +1,848 @@ +diff --git a/llvm/include/llvm/Support/AMDHSAKernelDescriptor.h b/llvm/include/llvm/Support/AMDHSAKernelDescriptor.h +--- a/llvm/include/llvm/Support/AMDHSAKernelDescriptor.h ++++ b/llvm/include/llvm/Support/AMDHSAKernelDescriptor.h +@@ -162,39 +162,49 @@ + uint8_t reserved2[6]; + }; + ++enum : uint32_t { ++ GROUP_SEGMENT_FIXED_SIZE_OFFSET = 0, ++ PRIVATE_SEGMENT_FIXED_SIZE_OFFSET = 4, ++ RESERVED0_OFFSET = 8, ++ KERNEL_CODE_ENTRY_BYTE_OFFSET_OFFSET = 16, ++ RESERVED1_OFFSET = 24, ++ COMPUTE_PGM_RSRC3_OFFSET = 44, ++ COMPUTE_PGM_RSRC1_OFFSET = 48, ++ COMPUTE_PGM_RSRC2_OFFSET = 52, ++ KERNEL_CODE_PROPERTIES_OFFSET = 56, ++ RESERVED2_OFFSET = 58, ++}; ++ + static_assert( + sizeof(kernel_descriptor_t) == 64, + "invalid size for kernel_descriptor_t"); +-static_assert( +- offsetof(kernel_descriptor_t, group_segment_fixed_size) == 0, +- "invalid offset for group_segment_fixed_size"); +-static_assert( +- offsetof(kernel_descriptor_t, private_segment_fixed_size) == 4, +- "invalid offset for private_segment_fixed_size"); +-static_assert( +- offsetof(kernel_descriptor_t, reserved0) == 8, +- "invalid offset for reserved0"); +-static_assert( +- offsetof(kernel_descriptor_t, kernel_code_entry_byte_offset) == 16, +- "invalid offset for kernel_code_entry_byte_offset"); +-static_assert( +- offsetof(kernel_descriptor_t, reserved1) == 24, +- "invalid offset for reserved1"); +-static_assert( +- offsetof(kernel_descriptor_t, compute_pgm_rsrc3) == 44, +- "invalid offset for compute_pgm_rsrc3"); +-static_assert( +- offsetof(kernel_descriptor_t, compute_pgm_rsrc1) == 48, +- "invalid offset for compute_pgm_rsrc1"); +-static_assert( +- offsetof(kernel_descriptor_t, compute_pgm_rsrc2) == 52, +- "invalid offset for compute_pgm_rsrc2"); +-static_assert( +- offsetof(kernel_descriptor_t, kernel_code_properties) == 56, +- "invalid offset for kernel_code_properties"); +-static_assert( +- offsetof(kernel_descriptor_t, reserved2) == 58, +- "invalid offset for reserved2"); ++static_assert(offsetof(kernel_descriptor_t, group_segment_fixed_size) == ++ GROUP_SEGMENT_FIXED_SIZE_OFFSET, ++ "invalid offset for group_segment_fixed_size"); ++static_assert(offsetof(kernel_descriptor_t, private_segment_fixed_size) == ++ PRIVATE_SEGMENT_FIXED_SIZE_OFFSET, ++ "invalid offset for private_segment_fixed_size"); ++static_assert(offsetof(kernel_descriptor_t, reserved0) == RESERVED0_OFFSET, ++ "invalid offset for reserved0"); ++static_assert(offsetof(kernel_descriptor_t, kernel_code_entry_byte_offset) == ++ KERNEL_CODE_ENTRY_BYTE_OFFSET_OFFSET, ++ "invalid offset for kernel_code_entry_byte_offset"); ++static_assert(offsetof(kernel_descriptor_t, reserved1) == RESERVED1_OFFSET, ++ "invalid offset for reserved1"); ++static_assert(offsetof(kernel_descriptor_t, compute_pgm_rsrc3) == ++ COMPUTE_PGM_RSRC3_OFFSET, ++ "invalid offset for compute_pgm_rsrc3"); ++static_assert(offsetof(kernel_descriptor_t, compute_pgm_rsrc1) == ++ COMPUTE_PGM_RSRC1_OFFSET, ++ "invalid offset for compute_pgm_rsrc1"); ++static_assert(offsetof(kernel_descriptor_t, compute_pgm_rsrc2) == ++ COMPUTE_PGM_RSRC2_OFFSET, ++ "invalid offset for compute_pgm_rsrc2"); ++static_assert(offsetof(kernel_descriptor_t, kernel_code_properties) == ++ KERNEL_CODE_PROPERTIES_OFFSET, ++ "invalid offset for kernel_code_properties"); ++static_assert(offsetof(kernel_descriptor_t, reserved2) == RESERVED2_OFFSET, ++ "invalid offset for reserved2"); + + } // end namespace amdhsa + } // end namespace llvm +diff --git a/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h b/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h +--- a/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h ++++ b/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h +@@ -17,10 +17,11 @@ + + #include "llvm/ADT/ArrayRef.h" + #include "llvm/MC/MCContext.h" +-#include "llvm/MC/MCInstrInfo.h" + #include "llvm/MC/MCDisassembler/MCDisassembler.h" + #include "llvm/MC/MCDisassembler/MCRelocationInfo.h" + #include "llvm/MC/MCDisassembler/MCSymbolizer.h" ++#include "llvm/MC/MCInstrInfo.h" ++#include "llvm/Support/DataExtractor.h" + + #include + #include +@@ -66,6 +67,33 @@ + DecodeStatus tryDecodeInst(const uint8_t* Table, MCInst &MI, uint64_t Inst, + uint64_t Address) const; + ++ Optional onSymbolStart(SymbolInfoTy &Symbol, uint64_t &Size, ++ ArrayRef Bytes, ++ uint64_t Address, ++ raw_ostream &CStream) const override; ++ ++ DecodeStatus decodeKernelDescriptor(StringRef KdName, ArrayRef Bytes, ++ uint64_t KdAddress) const; ++ ++ DecodeStatus ++ decodeKernelDescriptorDirective(DataExtractor::Cursor &Cursor, ++ ArrayRef Bytes, ++ raw_string_ostream &KdStream) const; ++ ++ /// Decode as directives that handle COMPUTE_PGM_RSRC1. ++ /// \param FourByteBuffer - Bytes holding contents of COMPUTE_PGM_RSRC1. ++ /// \param KdStream - Stream to write the disassembled directives to. ++ // NOLINTNEXTLINE(readability-identifier-naming) ++ DecodeStatus decodeCOMPUTE_PGM_RSRC1(uint32_t FourByteBuffer, ++ raw_string_ostream &KdStream) const; ++ ++ /// Decode as directives that handle COMPUTE_PGM_RSRC2. ++ /// \param FourByteBuffer - Bytes holding contents of COMPUTE_PGM_RSRC2. ++ /// \param KdStream - Stream to write the disassembled directives to. ++ // NOLINTNEXTLINE(readability-identifier-naming) ++ DecodeStatus decodeCOMPUTE_PGM_RSRC2(uint32_t FourByteBuffer, ++ raw_string_ostream &KdStream) const; ++ + DecodeStatus convertSDWAInst(MCInst &MI) const; + DecodeStatus convertDPP8Inst(MCInst &MI) const; + DecodeStatus convertMIMGInst(MCInst &MI) const; +diff --git a/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp b/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp +--- a/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp ++++ b/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp +@@ -34,6 +34,7 @@ + #include "llvm/MC/MCFixedLenDisassembler.h" + #include "llvm/MC/MCInst.h" + #include "llvm/MC/MCSubtargetInfo.h" ++#include "llvm/Support/AMDHSAKernelDescriptor.h" + #include "llvm/Support/Endian.h" + #include "llvm/Support/ErrorHandling.h" + #include "llvm/Support/MathExtras.h" +@@ -1215,6 +1216,350 @@ + return STI.getFeatureBits()[AMDGPU::FeatureGFX10]; + } + ++//===----------------------------------------------------------------------===// ++// AMDGPU specific symbol handling ++//===----------------------------------------------------------------------===// ++#define PRINT_DIRECTIVE(DIRECTIVE, MASK) \ ++ do { \ ++ KdStream << Indent << DIRECTIVE " " \ ++ << ((FourByteBuffer & MASK) >> (MASK##_SHIFT)) << '\n'; \ ++ } while (0) ++ ++// NOLINTNEXTLINE(readability-identifier-naming) ++MCDisassembler::DecodeStatus AMDGPUDisassembler::decodeCOMPUTE_PGM_RSRC1( ++ uint32_t FourByteBuffer, raw_string_ostream &KdStream) const { ++ using namespace amdhsa; ++ StringRef Indent = "\t"; ++ ++ // We cannot accurately backward compute #VGPRs used from ++ // GRANULATED_WORKITEM_VGPR_COUNT. But we are concerned with getting the same ++ // value of GRANULATED_WORKITEM_VGPR_COUNT in the reassembled binary. So we ++ // simply calculate the inverse of what the assembler does. ++ ++ uint32_t GranulatedWorkitemVGPRCount = ++ (FourByteBuffer & COMPUTE_PGM_RSRC1_GRANULATED_WORKITEM_VGPR_COUNT) >> ++ COMPUTE_PGM_RSRC1_GRANULATED_WORKITEM_VGPR_COUNT_SHIFT; ++ ++ uint32_t NextFreeVGPR = (GranulatedWorkitemVGPRCount + 1) * ++ AMDGPU::IsaInfo::getVGPREncodingGranule(&STI); ++ ++ KdStream << Indent << ".amdhsa_next_free_vgpr " << NextFreeVGPR << '\n'; ++ ++ // We cannot backward compute values used to calculate ++ // GRANULATED_WAVEFRONT_SGPR_COUNT. Hence the original values for following ++ // directives can't be computed: ++ // .amdhsa_reserve_vcc ++ // .amdhsa_reserve_flat_scratch ++ // .amdhsa_reserve_xnack_mask ++ // They take their respective default values if not specified in the assembly. ++ // ++ // GRANULATED_WAVEFRONT_SGPR_COUNT ++ // = f(NEXT_FREE_SGPR + VCC + FLAT_SCRATCH + XNACK_MASK) ++ // ++ // We compute the inverse as though all directives apart from NEXT_FREE_SGPR ++ // are set to 0. So while disassembling we consider that: ++ // ++ // GRANULATED_WAVEFRONT_SGPR_COUNT ++ // = f(NEXT_FREE_SGPR + 0 + 0 + 0) ++ // ++ // The disassembler cannot recover the original values of those 3 directives. ++ ++ uint32_t GranulatedWavefrontSGPRCount = ++ (FourByteBuffer & COMPUTE_PGM_RSRC1_GRANULATED_WAVEFRONT_SGPR_COUNT) >> ++ COMPUTE_PGM_RSRC1_GRANULATED_WAVEFRONT_SGPR_COUNT_SHIFT; ++ ++ if (isGFX10() && GranulatedWavefrontSGPRCount) ++ return MCDisassembler::Fail; ++ ++ uint32_t NextFreeSGPR = (GranulatedWavefrontSGPRCount + 1) * ++ AMDGPU::IsaInfo::getSGPREncodingGranule(&STI); ++ ++ KdStream << Indent << ".amdhsa_reserve_vcc " << 0 << '\n'; ++ KdStream << Indent << ".amdhsa_reserve_flat_scratch " << 0 << '\n'; ++ KdStream << Indent << ".amdhsa_reserve_xnack_mask " << 0 << '\n'; ++ KdStream << Indent << ".amdhsa_next_free_sgpr " << NextFreeSGPR << "\n"; ++ ++ if (FourByteBuffer & COMPUTE_PGM_RSRC1_PRIORITY) ++ return MCDisassembler::Fail; ++ ++ PRINT_DIRECTIVE(".amdhsa_float_round_mode_32", ++ COMPUTE_PGM_RSRC1_FLOAT_ROUND_MODE_32); ++ PRINT_DIRECTIVE(".amdhsa_float_round_mode_16_64", ++ COMPUTE_PGM_RSRC1_FLOAT_ROUND_MODE_16_64); ++ PRINT_DIRECTIVE(".amdhsa_float_denorm_mode_32", ++ COMPUTE_PGM_RSRC1_FLOAT_DENORM_MODE_32); ++ PRINT_DIRECTIVE(".amdhsa_float_denorm_mode_16_64", ++ COMPUTE_PGM_RSRC1_FLOAT_DENORM_MODE_16_64); ++ ++ if (FourByteBuffer & COMPUTE_PGM_RSRC1_PRIV) ++ return MCDisassembler::Fail; ++ ++ PRINT_DIRECTIVE(".amdhsa_dx10_clamp", COMPUTE_PGM_RSRC1_ENABLE_DX10_CLAMP); ++ ++ if (FourByteBuffer & COMPUTE_PGM_RSRC1_DEBUG_MODE) ++ return MCDisassembler::Fail; ++ ++ PRINT_DIRECTIVE(".amdhsa_ieee_mode", COMPUTE_PGM_RSRC1_ENABLE_IEEE_MODE); ++ ++ if (FourByteBuffer & COMPUTE_PGM_RSRC1_BULKY) ++ return MCDisassembler::Fail; ++ ++ if (FourByteBuffer & COMPUTE_PGM_RSRC1_CDBG_USER) ++ return MCDisassembler::Fail; ++ ++ PRINT_DIRECTIVE(".amdhsa_fp16_overflow", COMPUTE_PGM_RSRC1_FP16_OVFL); ++ ++ if (FourByteBuffer & COMPUTE_PGM_RSRC1_RESERVED0) ++ return MCDisassembler::Fail; ++ ++ if (isGFX10()) { ++ PRINT_DIRECTIVE(".amdhsa_workgroup_processor_mode", ++ COMPUTE_PGM_RSRC1_WGP_MODE); ++ PRINT_DIRECTIVE(".amdhsa_memory_ordered", COMPUTE_PGM_RSRC1_MEM_ORDERED); ++ PRINT_DIRECTIVE(".amdhsa_forward_progress", COMPUTE_PGM_RSRC1_FWD_PROGRESS); ++ } ++ return MCDisassembler::Success; ++} ++ ++// NOLINTNEXTLINE(readability-identifier-naming) ++MCDisassembler::DecodeStatus AMDGPUDisassembler::decodeCOMPUTE_PGM_RSRC2( ++ uint32_t FourByteBuffer, raw_string_ostream &KdStream) const { ++ using namespace amdhsa; ++ StringRef Indent = "\t"; ++ PRINT_DIRECTIVE( ++ ".amdhsa_system_sgpr_private_segment_wavefront_offset", ++ COMPUTE_PGM_RSRC2_ENABLE_SGPR_PRIVATE_SEGMENT_WAVEFRONT_OFFSET); ++ PRINT_DIRECTIVE(".amdhsa_system_sgpr_workgroup_id_x", ++ COMPUTE_PGM_RSRC2_ENABLE_SGPR_WORKGROUP_ID_X); ++ PRINT_DIRECTIVE(".amdhsa_system_sgpr_workgroup_id_y", ++ COMPUTE_PGM_RSRC2_ENABLE_SGPR_WORKGROUP_ID_Y); ++ PRINT_DIRECTIVE(".amdhsa_system_sgpr_workgroup_id_z", ++ COMPUTE_PGM_RSRC2_ENABLE_SGPR_WORKGROUP_ID_Z); ++ PRINT_DIRECTIVE(".amdhsa_system_sgpr_workgroup_info", ++ COMPUTE_PGM_RSRC2_ENABLE_SGPR_WORKGROUP_INFO); ++ PRINT_DIRECTIVE(".amdhsa_system_vgpr_workitem_id", ++ COMPUTE_PGM_RSRC2_ENABLE_VGPR_WORKITEM_ID); ++ ++ if (FourByteBuffer & COMPUTE_PGM_RSRC2_ENABLE_EXCEPTION_ADDRESS_WATCH) ++ return MCDisassembler::Fail; ++ ++ if (FourByteBuffer & COMPUTE_PGM_RSRC2_ENABLE_EXCEPTION_MEMORY) ++ return MCDisassembler::Fail; ++ ++ if (FourByteBuffer & COMPUTE_PGM_RSRC2_GRANULATED_LDS_SIZE) ++ return MCDisassembler::Fail; ++ ++ PRINT_DIRECTIVE( ++ ".amdhsa_exception_fp_ieee_invalid_op", ++ COMPUTE_PGM_RSRC2_ENABLE_EXCEPTION_IEEE_754_FP_INVALID_OPERATION); ++ PRINT_DIRECTIVE(".amdhsa_exception_fp_denorm_src", ++ COMPUTE_PGM_RSRC2_ENABLE_EXCEPTION_FP_DENORMAL_SOURCE); ++ PRINT_DIRECTIVE( ++ ".amdhsa_exception_fp_ieee_div_zero", ++ COMPUTE_PGM_RSRC2_ENABLE_EXCEPTION_IEEE_754_FP_DIVISION_BY_ZERO); ++ PRINT_DIRECTIVE(".amdhsa_exception_fp_ieee_overflow", ++ COMPUTE_PGM_RSRC2_ENABLE_EXCEPTION_IEEE_754_FP_OVERFLOW); ++ PRINT_DIRECTIVE(".amdhsa_exception_fp_ieee_underflow", ++ COMPUTE_PGM_RSRC2_ENABLE_EXCEPTION_IEEE_754_FP_UNDERFLOW); ++ PRINT_DIRECTIVE(".amdhsa_exception_fp_ieee_inexact", ++ COMPUTE_PGM_RSRC2_ENABLE_EXCEPTION_IEEE_754_FP_INEXACT); ++ PRINT_DIRECTIVE(".amdhsa_exception_int_div_zero", ++ COMPUTE_PGM_RSRC2_ENABLE_EXCEPTION_INT_DIVIDE_BY_ZERO); ++ ++ if (FourByteBuffer & COMPUTE_PGM_RSRC2_RESERVED0) ++ return MCDisassembler::Fail; ++ ++ return MCDisassembler::Success; ++} ++ ++#undef PRINT_DIRECTIVE ++ ++MCDisassembler::DecodeStatus ++AMDGPUDisassembler::decodeKernelDescriptorDirective( ++ DataExtractor::Cursor &Cursor, ArrayRef Bytes, ++ raw_string_ostream &KdStream) const { ++#define PRINT_DIRECTIVE(DIRECTIVE, MASK) \ ++ do { \ ++ KdStream << Indent << DIRECTIVE " " \ ++ << ((TwoByteBuffer & MASK) >> (MASK##_SHIFT)) << '\n'; \ ++ } while (0) ++ ++ uint16_t TwoByteBuffer = 0; ++ uint32_t FourByteBuffer = 0; ++ uint64_t EightByteBuffer = 0; ++ ++ StringRef ReservedBytes; ++ StringRef Indent = "\t"; ++ ++ assert(Bytes.size() == 64); ++ DataExtractor DE(Bytes, /*IsLittleEndian=*/true, /*AddressSize=*/8); ++ ++ switch (Cursor.tell()) { ++ case amdhsa::GROUP_SEGMENT_FIXED_SIZE_OFFSET: ++ FourByteBuffer = DE.getU32(Cursor); ++ KdStream << Indent << ".amdhsa_group_segment_fixed_size " << FourByteBuffer ++ << '\n'; ++ return MCDisassembler::Success; ++ ++ case amdhsa::PRIVATE_SEGMENT_FIXED_SIZE_OFFSET: ++ FourByteBuffer = DE.getU32(Cursor); ++ KdStream << Indent << ".amdhsa_private_segment_fixed_size " ++ << FourByteBuffer << '\n'; ++ return MCDisassembler::Success; ++ ++ case amdhsa::RESERVED0_OFFSET: ++ // 8 reserved bytes, must be 0. ++ EightByteBuffer = DE.getU64(Cursor); ++ if (EightByteBuffer) { ++ return MCDisassembler::Fail; ++ } ++ return MCDisassembler::Success; ++ ++ case amdhsa::KERNEL_CODE_ENTRY_BYTE_OFFSET_OFFSET: ++ // KERNEL_CODE_ENTRY_BYTE_OFFSET ++ // So far no directive controls this for Code Object V3, so simply skip for ++ // disassembly. ++ DE.skip(Cursor, 8); ++ return MCDisassembler::Success; ++ ++ case amdhsa::RESERVED1_OFFSET: ++ // 20 reserved bytes, must be 0. ++ ReservedBytes = DE.getBytes(Cursor, 20); ++ for (int I = 0; I < 20; ++I) { ++ if (ReservedBytes[I] != 0) { ++ return MCDisassembler::Fail; ++ } ++ } ++ return MCDisassembler::Success; ++ ++ case amdhsa::COMPUTE_PGM_RSRC3_OFFSET: ++ // COMPUTE_PGM_RSRC3 ++ // - Only set for GFX10, GFX6-9 have this to be 0. ++ // - Currently no directives directly control this. ++ FourByteBuffer = DE.getU32(Cursor); ++ if (!isGFX10() && FourByteBuffer) { ++ return MCDisassembler::Fail; ++ } ++ return MCDisassembler::Success; ++ ++ case amdhsa::COMPUTE_PGM_RSRC1_OFFSET: ++ FourByteBuffer = DE.getU32(Cursor); ++ if (decodeCOMPUTE_PGM_RSRC1(FourByteBuffer, KdStream) == ++ MCDisassembler::Fail) { ++ return MCDisassembler::Fail; ++ } ++ return MCDisassembler::Success; ++ ++ case amdhsa::COMPUTE_PGM_RSRC2_OFFSET: ++ FourByteBuffer = DE.getU32(Cursor); ++ if (decodeCOMPUTE_PGM_RSRC2(FourByteBuffer, KdStream) == ++ MCDisassembler::Fail) { ++ return MCDisassembler::Fail; ++ } ++ return MCDisassembler::Success; ++ ++ case amdhsa::KERNEL_CODE_PROPERTIES_OFFSET: ++ using namespace amdhsa; ++ TwoByteBuffer = DE.getU16(Cursor); ++ ++ PRINT_DIRECTIVE(".amdhsa_user_sgpr_private_segment_buffer", ++ KERNEL_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_BUFFER); ++ PRINT_DIRECTIVE(".amdhsa_user_sgpr_dispatch_ptr", ++ KERNEL_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_PTR); ++ PRINT_DIRECTIVE(".amdhsa_user_sgpr_queue_ptr", ++ KERNEL_CODE_PROPERTY_ENABLE_SGPR_QUEUE_PTR); ++ PRINT_DIRECTIVE(".amdhsa_user_sgpr_kernarg_segment_ptr", ++ KERNEL_CODE_PROPERTY_ENABLE_SGPR_KERNARG_SEGMENT_PTR); ++ PRINT_DIRECTIVE(".amdhsa_user_sgpr_dispatch_id", ++ KERNEL_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_ID); ++ PRINT_DIRECTIVE(".amdhsa_user_sgpr_flat_scratch_init", ++ KERNEL_CODE_PROPERTY_ENABLE_SGPR_FLAT_SCRATCH_INIT); ++ PRINT_DIRECTIVE(".amdhsa_user_sgpr_private_segment_size", ++ KERNEL_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_SIZE); ++ ++ if (TwoByteBuffer & KERNEL_CODE_PROPERTY_RESERVED0) ++ return MCDisassembler::Fail; ++ ++ // Reserved for GFX9 ++ if (isGFX9() && ++ (TwoByteBuffer & KERNEL_CODE_PROPERTY_ENABLE_WAVEFRONT_SIZE32)) { ++ return MCDisassembler::Fail; ++ } else if (isGFX10()) { ++ PRINT_DIRECTIVE(".amdhsa_wavefront_size32", ++ KERNEL_CODE_PROPERTY_ENABLE_WAVEFRONT_SIZE32); ++ } ++ ++ if (TwoByteBuffer & KERNEL_CODE_PROPERTY_RESERVED1) ++ return MCDisassembler::Fail; ++ ++ return MCDisassembler::Success; ++ ++ case amdhsa::RESERVED2_OFFSET: ++ // 6 bytes from here are reserved, must be 0. ++ ReservedBytes = DE.getBytes(Cursor, 6); ++ for (int I = 0; I < 6; ++I) { ++ if (ReservedBytes[I] != 0) ++ return MCDisassembler::Fail; ++ } ++ return MCDisassembler::Success; ++ ++ default: ++ llvm_unreachable("Unhandled index. Case statements cover everything."); ++ return MCDisassembler::Fail; ++ } ++#undef PRINT_DIRECTIVE ++} ++ ++MCDisassembler::DecodeStatus AMDGPUDisassembler::decodeKernelDescriptor( ++ StringRef KdName, ArrayRef Bytes, uint64_t KdAddress) const { ++ // CP microcode requires the kernel descriptor to be 64 aligned. ++ if (Bytes.size() != 64 || KdAddress % 64 != 0) ++ return MCDisassembler::Fail; ++ ++ std::string Kd; ++ raw_string_ostream KdStream(Kd); ++ KdStream << ".amdhsa_kernel " << KdName << '\n'; ++ ++ DataExtractor::Cursor C(0); ++ while (C && C.tell() < Bytes.size()) { ++ MCDisassembler::DecodeStatus Status = ++ decodeKernelDescriptorDirective(C, Bytes, KdStream); ++ ++ cantFail(C.takeError()); ++ ++ if (Status == MCDisassembler::Fail) ++ return MCDisassembler::Fail; ++ } ++ KdStream << ".end_amdhsa_kernel\n"; ++ outs() << KdStream.str(); ++ return MCDisassembler::Success; ++} ++ ++Optional ++AMDGPUDisassembler::onSymbolStart(SymbolInfoTy &Symbol, uint64_t &Size, ++ ArrayRef Bytes, uint64_t Address, ++ raw_ostream &CStream) const { ++ // Right now only kernel descriptor needs to be handled. ++ // We ignore all other symbols for target specific handling. ++ // TODO: ++ // Fix the spurious symbol issue for AMDGPU kernels. Exists for both Code ++ // Object V2 and V3 when symbols are marked protected. ++ ++ // amd_kernel_code_t for Code Object V2. ++ if (Symbol.Type == ELF::STT_AMDGPU_HSA_KERNEL) { ++ Size = 256; ++ return MCDisassembler::Fail; ++ } ++ ++ // Code Object V3 kernel descriptors. ++ StringRef Name = Symbol.Name; ++ if (Symbol.Type == ELF::STT_OBJECT && Name.endswith(StringRef(".kd"))) { ++ Size = 64; // Size = 64 regardless of success or failure. ++ return decodeKernelDescriptor(Name.drop_back(3), Bytes, Address); ++ } ++ return None; ++} ++ + //===----------------------------------------------------------------------===// + // AMDGPUSymbolizer + //===----------------------------------------------------------------------===// +diff --git a/llvm/test/CodeGen/AMDGPU/nop-data.ll b/llvm/test/CodeGen/AMDGPU/nop-data.ll +--- a/llvm/test/CodeGen/AMDGPU/nop-data.ll ++++ b/llvm/test/CodeGen/AMDGPU/nop-data.ll +@@ -1,7 +1,7 @@ + ; RUN: llc -mtriple=amdgcn--amdhsa -mattr=-code-object-v3 -mcpu=fiji -filetype=obj < %s | llvm-objdump -d - --mcpu=fiji | FileCheck %s + + ; CHECK: : +-; CHECK-NEXT: s_endpgm ++; CHECK: s_endpgm + define amdgpu_kernel void @kernel0() align 256 { + entry: + ret void +@@ -80,7 +80,7 @@ + + ; CHECK-EMPTY: + ; CHECK-NEXT: : +-; CHECK-NEXT: s_endpgm ++; CHECK: s_endpgm + define amdgpu_kernel void @kernel1(i32 addrspace(1)* addrspace(4)* %ptr.out) align 256 { + entry: + ret void +diff --git a/llvm/test/tools/llvm-objdump/ELF/AMDGPU/kd-failure.s b/llvm/test/tools/llvm-objdump/ELF/AMDGPU/kd-failure.s +new file mode 100644 +--- /dev/null ++++ b/llvm/test/tools/llvm-objdump/ELF/AMDGPU/kd-failure.s +@@ -0,0 +1,37 @@ ++;; Failure test. We create a malformed kernel descriptor (KD) by manually ++;; setting the bytes, because one can't create a malformed KD using the ++;; assembler directives. ++ ++; RUN: llvm-mc %s -mattr=+code-object-v3 --triple=amdgcn-amd-amdhsa -mcpu=gfx908 -filetype=obj -o %t.o ++ ++; RUN: printf ".type my_kernel.kd, @object \nmy_kernel.kd:\n.size my_kernel.kd, 64\n" > %t1.sym_info ++; RUN: llvm-objdump --disassemble-symbols=my_kernel.kd %t.o \ ++; RUN: | tail -n +9 > %t1.sym_content ++; RUN: cat %t1.sym_info %t1.sym_content > %t1.s ++ ++; RUN: llvm-mc %t1.s -mattr=+code-object-v3 --triple=amdgcn-amd-amdhsa -mcpu=gfx908 -filetype=obj -o %t-re-assemble.o ++; RUN: diff %t.o %t-re-assemble.o ++ ++;; Test failure by setting one of the reserved bytes to non-zero value. ++ ++.type my_kernel.kd, @object ++.size my_kernel.kd, 64 ++my_kernel.kd: ++ .long 0x00000000 ;; group_segment_fixed_size ++ .long 0x00000000 ;; private_segment_fixed_size ++ .quad 0x00FF000000000000 ;; reserved bytes. ++ .quad 0x0000000000000000 ;; kernel_code_entry_byte_offset, any value works. ++ ++ ;; 20 reserved bytes. ++ .quad 0x0000000000000000 ++ .quad 0x0000000000000000 ++ .long 0x00000000 ++ ++ .long 0x00000000 ;; compute_PGM_RSRC3 ++ .long 0x00000000 ;; compute_PGM_RSRC1 ++ .long 0x00000000 ;; compute_PGM_RSRC2 ++ .short 0x0000 ;; additional fields. ++ ++ ;; 6 reserved bytes. ++ .long 0x0000000 ++ .short 0x0000 +diff --git a/llvm/test/tools/llvm-objdump/ELF/AMDGPU/kd-sgpr.s b/llvm/test/tools/llvm-objdump/ELF/AMDGPU/kd-sgpr.s +new file mode 100644 +--- /dev/null ++++ b/llvm/test/tools/llvm-objdump/ELF/AMDGPU/kd-sgpr.s +@@ -0,0 +1,49 @@ ++;; Test disassembly for GRANULATED_WAVEFRONT_SGPR_COUNT in the kernel descriptor. ++ ++; RUN: split-file %s %t.dir ++ ++; RUN: llvm-mc %t.dir/1.s -mattr=+code-object-v3 --triple=amdgcn-amd-amdhsa -mcpu=gfx908 -filetype=obj -o %t1 ++; RUN: llvm-objdump --disassemble-symbols=my_kernel_1.kd %t1 | tail -n +8 \ ++; RUN: | llvm-mc --triple=amdgcn-amd-amdhsa -mcpu=gfx908 -filetype=obj -o %t1-re-assemble ++; RUN: diff %t1 %t1-re-assemble ++ ++; RUN: llvm-mc %t.dir/2.s -mattr=+code-object-v3 --triple=amdgcn-amd-amdhsa -mcpu=gfx908 -filetype=obj -o %t2 ++; RUN: llvm-objdump --disassemble-symbols=my_kernel_2.kd %t2 | tail -n +8 \ ++; RUN: | llvm-mc --triple=amdgcn-amd-amdhsa -mcpu=gfx908 -filetype=obj -o %t2-re-assemble ++; RUN: diff %t2 %t2-re-assemble ++ ++; RUN: llvm-mc %t.dir/3.s -mattr=+code-object-v3 --triple=amdgcn-amd-amdhsa -mcpu=gfx908 -filetype=obj -o %t3 ++; RUN: llvm-objdump --disassemble-symbols=my_kernel_3.kd %t3 | tail -n +8 \ ++; RUN: | llvm-mc --triple=amdgcn-amd-amdhsa -mcpu=gfx908 -filetype=obj -o %t3-re-assemble ++; RUN: diff %t3 %t3-re-assemble ++ ++ ++;--- 1.s ++;; Only set next_free_sgpr. ++.amdhsa_kernel my_kernel_1 ++ .amdhsa_next_free_vgpr 0 ++ .amdhsa_next_free_sgpr 42 ++ .amdhsa_reserve_flat_scratch 0 ++ .amdhsa_reserve_xnack_mask 0 ++ .amdhsa_reserve_vcc 0 ++.end_amdhsa_kernel ++ ++;--- 2.s ++;; Only set other directives. ++.amdhsa_kernel my_kernel_2 ++ .amdhsa_next_free_vgpr 0 ++ .amdhsa_next_free_sgpr 0 ++ .amdhsa_reserve_flat_scratch 1 ++ .amdhsa_reserve_xnack_mask 1 ++ .amdhsa_reserve_vcc 1 ++.end_amdhsa_kernel ++ ++;--- 3.s ++;; Set all affecting directives. ++.amdhsa_kernel my_kernel_3 ++ .amdhsa_next_free_vgpr 0 ++ .amdhsa_next_free_sgpr 35 ++ .amdhsa_reserve_flat_scratch 1 ++ .amdhsa_reserve_xnack_mask 1 ++ .amdhsa_reserve_vcc 1 ++.end_amdhsa_kernel +diff --git a/llvm/test/tools/llvm-objdump/ELF/AMDGPU/kd-vgpr.s b/llvm/test/tools/llvm-objdump/ELF/AMDGPU/kd-vgpr.s +new file mode 100644 +--- /dev/null ++++ b/llvm/test/tools/llvm-objdump/ELF/AMDGPU/kd-vgpr.s +@@ -0,0 +1,36 @@ ++;; Test disassembly for GRANULATED_WORKITEM_VGPR_COUNT in the kernel descriptor. ++ ++; RUN: split-file %s %t.dir ++ ++; RUN: llvm-mc %t.dir/1.s -mattr=+code-object-v3 --triple=amdgcn-amd-amdhsa -mcpu=gfx908 -filetype=obj -o %t1 ++; RUN: llvm-objdump --disassemble-symbols=my_kernel_1.kd %t1 | tail -n +8 \ ++; RUN: | llvm-mc --triple=amdgcn-amd-amdhsa -mcpu=gfx908 -filetype=obj -o %t1-re-assemble ++; RUN: diff %t1 %t1-re-assemble ++ ++; RUN: llvm-mc %t.dir/2.s -mattr=+code-object-v3 --triple=amdgcn-amd-amdhsa -mcpu=gfx908 -filetype=obj -o %t2 ++; RUN: llvm-objdump --disassemble-symbols=my_kernel_2.kd %t2 | tail -n +8 \ ++; RUN: | llvm-mc --triple=amdgcn-amd-amdhsa -mcpu=gfx908 -filetype=obj -o %t2-re-assemble ++; RUN: diff %t2 %t2-re-assemble ++ ++; RUN: llvm-mc %t.dir/3.s -mattr=+code-object-v3 --triple=amdgcn-amd-amdhsa -mcpu=gfx908 -filetype=obj -o %t3 ++; RUN: llvm-objdump --disassemble-symbols=my_kernel_3.kd %t3 | tail -n +8 \ ++; RUN: | llvm-mc --triple=amdgcn-amd-amdhsa -mcpu=gfx908 -filetype=obj -o %t3-re-assemble ++; RUN: diff %t3 %t3-re-assemble ++ ++;--- 1.s ++.amdhsa_kernel my_kernel_1 ++ .amdhsa_next_free_vgpr 23 ++ .amdhsa_next_free_sgpr 0 ++.end_amdhsa_kernel ++ ++;--- 2.s ++.amdhsa_kernel my_kernel_2 ++ .amdhsa_next_free_vgpr 14 ++ .amdhsa_next_free_sgpr 0 ++.end_amdhsa_kernel ++ ++;--- 3.s ++.amdhsa_kernel my_kernel_3 ++ .amdhsa_next_free_vgpr 32 ++ .amdhsa_next_free_sgpr 0 ++.end_amdhsa_kernel +diff --git a/llvm/test/tools/llvm-objdump/ELF/AMDGPU/kd-zeroed-gfx10.s b/llvm/test/tools/llvm-objdump/ELF/AMDGPU/kd-zeroed-gfx10.s +new file mode 100644 +--- /dev/null ++++ b/llvm/test/tools/llvm-objdump/ELF/AMDGPU/kd-zeroed-gfx10.s +@@ -0,0 +1,58 @@ ++;; Entirely zeroed kernel descriptor (for GFX10). ++ ++; RUN: llvm-mc %s -mattr=+code-object-v3 --triple=amdgcn-amd-amdhsa -mcpu=gfx1010 -filetype=obj -o %t ++; RUN: llvm-objdump -s -j .text %t | FileCheck --check-prefix=OBJDUMP %s ++ ++;; TODO: ++;; This file and kd-zeroed-raw.s should produce the same output for the kernel ++;; descriptor - a block of 64 zeroed bytes. But looks like the assembler sets ++;; the FWD_PROGRESS bit in COMPUTE_PGM_RSRC1 to 1 even when the directive ++;; mentions 0 (see line 36). ++ ++;; Check the raw bytes right now. ++ ++; OBJDUMP: 0000 00000000 00000000 00000000 00000000 ++; OBJDUMP-NEXT: 0010 00000000 00000000 00000000 00000000 ++; OBJDUMP-NEXT: 0020 00000000 00000000 00000000 00000000 ++; OBJDUMP-NEXT: 0030 01000000 00000000 00000000 00000000 ++ ++.amdhsa_kernel my_kernel ++ .amdhsa_group_segment_fixed_size 0 ++ .amdhsa_private_segment_fixed_size 0 ++ .amdhsa_next_free_vgpr 8 ++ .amdhsa_reserve_vcc 0 ++ .amdhsa_reserve_flat_scratch 0 ++ .amdhsa_reserve_xnack_mask 0 ++ .amdhsa_next_free_sgpr 8 ++ .amdhsa_float_round_mode_32 0 ++ .amdhsa_float_round_mode_16_64 0 ++ .amdhsa_float_denorm_mode_32 0 ++ .amdhsa_float_denorm_mode_16_64 0 ++ .amdhsa_dx10_clamp 0 ++ .amdhsa_ieee_mode 0 ++ .amdhsa_fp16_overflow 0 ++ .amdhsa_workgroup_processor_mode 0 ++ .amdhsa_memory_ordered 0 ++ .amdhsa_forward_progress 0 ++ .amdhsa_system_sgpr_private_segment_wavefront_offset 0 ++ .amdhsa_system_sgpr_workgroup_id_x 0 ++ .amdhsa_system_sgpr_workgroup_id_y 0 ++ .amdhsa_system_sgpr_workgroup_id_z 0 ++ .amdhsa_system_sgpr_workgroup_info 0 ++ .amdhsa_system_vgpr_workitem_id 0 ++ .amdhsa_exception_fp_ieee_invalid_op 0 ++ .amdhsa_exception_fp_denorm_src 0 ++ .amdhsa_exception_fp_ieee_div_zero 0 ++ .amdhsa_exception_fp_ieee_overflow 0 ++ .amdhsa_exception_fp_ieee_underflow 0 ++ .amdhsa_exception_fp_ieee_inexact 0 ++ .amdhsa_exception_int_div_zero 0 ++ .amdhsa_user_sgpr_private_segment_buffer 0 ++ .amdhsa_user_sgpr_dispatch_ptr 0 ++ .amdhsa_user_sgpr_queue_ptr 0 ++ .amdhsa_user_sgpr_kernarg_segment_ptr 0 ++ .amdhsa_user_sgpr_dispatch_id 0 ++ .amdhsa_user_sgpr_flat_scratch_init 0 ++ .amdhsa_user_sgpr_private_segment_size 0 ++ .amdhsa_wavefront_size32 0 ++.end_amdhsa_kernel +diff --git a/llvm/test/tools/llvm-objdump/ELF/AMDGPU/kd-zeroed-gfx9.s b/llvm/test/tools/llvm-objdump/ELF/AMDGPU/kd-zeroed-gfx9.s +new file mode 100644 +--- /dev/null ++++ b/llvm/test/tools/llvm-objdump/ELF/AMDGPU/kd-zeroed-gfx9.s +@@ -0,0 +1,53 @@ ++;; Entirely zeroed kernel descriptor (for GFX9). ++ ++; RUN: llvm-mc %s -mattr=+code-object-v3 --triple=amdgcn-amd-amdhsa -mcpu=gfx908 -filetype=obj -o %t1 ++; RUN: llvm-objdump --disassemble-symbols=my_kernel.kd %t1 \ ++; RUN: | tail -n +8 | llvm-mc --triple=amdgcn-amd-amdhsa -mcpu=gfx908 -filetype=obj -o %t2 ++; RUN: diff %t1 %t2 ++ ++; RUN: llvm-objdump -s -j .text %t1 | FileCheck --check-prefix=OBJDUMP %s ++ ++; OBJDUMP: 0000 00000000 00000000 00000000 00000000 ++; OBJDUMP-NEXT: 0010 00000000 00000000 00000000 00000000 ++; OBJDUMP-NEXT: 0020 00000000 00000000 00000000 00000000 ++; OBJDUMP-NEXT: 0030 00000000 00000000 00000000 00000000 ++ ++;; This file and kd-zeroed-raw.s produce the same output for the kernel ++;; descriptor - a block of 64 zeroed bytes. ++ ++.amdhsa_kernel my_kernel ++ .amdhsa_group_segment_fixed_size 0 ++ .amdhsa_private_segment_fixed_size 0 ++ .amdhsa_next_free_vgpr 0 ++ .amdhsa_reserve_vcc 0 ++ .amdhsa_reserve_flat_scratch 0 ++ .amdhsa_reserve_xnack_mask 0 ++ .amdhsa_next_free_sgpr 0 ++ .amdhsa_float_round_mode_32 0 ++ .amdhsa_float_round_mode_16_64 0 ++ .amdhsa_float_denorm_mode_32 0 ++ .amdhsa_float_denorm_mode_16_64 0 ++ .amdhsa_dx10_clamp 0 ++ .amdhsa_ieee_mode 0 ++ .amdhsa_fp16_overflow 0 ++ .amdhsa_system_sgpr_private_segment_wavefront_offset 0 ++ .amdhsa_system_sgpr_workgroup_id_x 0 ++ .amdhsa_system_sgpr_workgroup_id_y 0 ++ .amdhsa_system_sgpr_workgroup_id_z 0 ++ .amdhsa_system_sgpr_workgroup_info 0 ++ .amdhsa_system_vgpr_workitem_id 0 ++ .amdhsa_exception_fp_ieee_invalid_op 0 ++ .amdhsa_exception_fp_denorm_src 0 ++ .amdhsa_exception_fp_ieee_div_zero 0 ++ .amdhsa_exception_fp_ieee_overflow 0 ++ .amdhsa_exception_fp_ieee_underflow 0 ++ .amdhsa_exception_fp_ieee_inexact 0 ++ .amdhsa_exception_int_div_zero 0 ++ .amdhsa_user_sgpr_private_segment_buffer 0 ++ .amdhsa_user_sgpr_dispatch_ptr 0 ++ .amdhsa_user_sgpr_queue_ptr 0 ++ .amdhsa_user_sgpr_kernarg_segment_ptr 0 ++ .amdhsa_user_sgpr_dispatch_id 0 ++ .amdhsa_user_sgpr_flat_scratch_init 0 ++ .amdhsa_user_sgpr_private_segment_size 0 ++.end_amdhsa_kernel +diff --git a/llvm/test/tools/llvm-objdump/ELF/AMDGPU/kd-zeroed-raw.s b/llvm/test/tools/llvm-objdump/ELF/AMDGPU/kd-zeroed-raw.s +new file mode 100644 +--- /dev/null ++++ b/llvm/test/tools/llvm-objdump/ELF/AMDGPU/kd-zeroed-raw.s +@@ -0,0 +1,41 @@ ++; RUN: llvm-mc %s -mattr=+code-object-v3 --triple=amdgcn-amd-amdhsa -mcpu=gfx908 -filetype=obj -o %t1 ++; RUN: llvm-objdump --disassemble-symbols=my_kernel.kd %t1 \ ++; RUN: | tail -n +8 | llvm-mc --triple=amdgcn-amd-amdhsa -mcpu=gfx908 -filetype=obj -o %t2 ++; RUN: llvm-objdump -s -j .text %t2 | FileCheck --check-prefix=OBJDUMP %s ++ ++;; Not running lit-test over gfx10 (see kd-zeroed-gfx10.s for details). ++;; kd-zeroed-raw.s and kd-zeroed-*.s should produce the same output for the ++;; kernel descriptor - a block of 64 zeroed bytes. ++ ++;; The disassembly will produce the contents of kd-zeroed-*.s which on being ++;; assembled contains additional relocation info. A diff over the entire object ++;; will fail in this case. So we check by looking the bytes in .text. ++ ++; OBJDUMP: 0000 00000000 00000000 00000000 00000000 ++; OBJDUMP-NEXT: 0010 00000000 00000000 00000000 00000000 ++; OBJDUMP-NEXT: 0020 00000000 00000000 00000000 00000000 ++; OBJDUMP-NEXT: 0030 00000000 00000000 00000000 00000000 ++ ++;; The entire object is zeroed out. ++ ++.type my_kernel.kd, @object ++.size my_kernel.kd, 64 ++my_kernel.kd: ++ .long 0x00000000 ;; group_segment_fixed_size ++ .long 0x00000000 ;; private_segment_fixed_size ++ .quad 0x0000000000000000 ;; reserved bytes. ++ .quad 0x0000000000000000 ;; kernel_code_entry_byte_offset, any value works. ++ ++ ;; 20 reserved bytes. ++ .quad 0x0000000000000000 ++ .quad 0x0000000000000000 ++ .long 0x00000000 ++ ++ .long 0x00000000 ;; compute_PGM_RSRC3 ++ .long 0x00000000 ;; compute_PGM_RSRC1 ++ .long 0x00000000 ;; compute_PGM_RSRC2 ++ .short 0x0000 ;; additional fields. ++ ++ ;; 6 reserved bytes. ++ .long 0x0000000 ++ .short 0x0000 +diff --git a/llvm/tools/llvm-objdump/llvm-objdump.cpp b/llvm/tools/llvm-objdump/llvm-objdump.cpp +--- a/llvm/tools/llvm-objdump/llvm-objdump.cpp ++++ b/llvm/tools/llvm-objdump/llvm-objdump.cpp +@@ -1854,23 +1854,6 @@ + outs() << SectionName << ":\n"; + } + +- if (Obj->isELF() && Obj->getArch() == Triple::amdgcn) { +- if (Symbols[SI].Type == ELF::STT_AMDGPU_HSA_KERNEL) { +- // skip amd_kernel_code_t at the begining of kernel symbol (256 bytes) +- Start += 256; +- } +- if (SI == SE - 1 || +- Symbols[SI + 1].Type == ELF::STT_AMDGPU_HSA_KERNEL) { +- // cut trailing zeroes at the end of kernel +- // cut up to 256 bytes +- const uint64_t EndAlign = 256; +- const auto Limit = End - (std::min)(EndAlign, End - Start); +- while (End > Limit && +- *reinterpret_cast(&Bytes[End - 4]) == 0) +- End -= 4; +- } +- } +- + outs() << '\n'; + if (!NoLeadingAddr) + outs() << format(Is64Bits ? "%016" PRIx64 " " : "%08" PRIx64 " ", + diff --git a/llvm/include/llvm/Support/AMDHSAKernelDescriptor.h b/llvm/include/llvm/Support/AMDHSAKernelDescriptor.h --- a/llvm/include/llvm/Support/AMDHSAKernelDescriptor.h +++ b/llvm/include/llvm/Support/AMDHSAKernelDescriptor.h @@ -162,39 +162,49 @@ uint8_t reserved2[6]; }; +enum : uint32_t { + GROUP_SEGMENT_FIXED_SIZE_OFFSET = 0, + PRIVATE_SEGMENT_FIXED_SIZE_OFFSET = 4, + RESERVED0_OFFSET = 8, + KERNEL_CODE_ENTRY_BYTE_OFFSET_OFFSET = 16, + RESERVED1_OFFSET = 24, + COMPUTE_PGM_RSRC3_OFFSET = 44, + COMPUTE_PGM_RSRC1_OFFSET = 48, + COMPUTE_PGM_RSRC2_OFFSET = 52, + KERNEL_CODE_PROPERTIES_OFFSET = 56, + RESERVED2_OFFSET = 58, +}; + static_assert( sizeof(kernel_descriptor_t) == 64, "invalid size for kernel_descriptor_t"); -static_assert( - offsetof(kernel_descriptor_t, group_segment_fixed_size) == 0, - "invalid offset for group_segment_fixed_size"); -static_assert( - offsetof(kernel_descriptor_t, private_segment_fixed_size) == 4, - "invalid offset for private_segment_fixed_size"); -static_assert( - offsetof(kernel_descriptor_t, reserved0) == 8, - "invalid offset for reserved0"); -static_assert( - offsetof(kernel_descriptor_t, kernel_code_entry_byte_offset) == 16, - "invalid offset for kernel_code_entry_byte_offset"); -static_assert( - offsetof(kernel_descriptor_t, reserved1) == 24, - "invalid offset for reserved1"); -static_assert( - offsetof(kernel_descriptor_t, compute_pgm_rsrc3) == 44, - "invalid offset for compute_pgm_rsrc3"); -static_assert( - offsetof(kernel_descriptor_t, compute_pgm_rsrc1) == 48, - "invalid offset for compute_pgm_rsrc1"); -static_assert( - offsetof(kernel_descriptor_t, compute_pgm_rsrc2) == 52, - "invalid offset for compute_pgm_rsrc2"); -static_assert( - offsetof(kernel_descriptor_t, kernel_code_properties) == 56, - "invalid offset for kernel_code_properties"); -static_assert( - offsetof(kernel_descriptor_t, reserved2) == 58, - "invalid offset for reserved2"); +static_assert(offsetof(kernel_descriptor_t, group_segment_fixed_size) == + GROUP_SEGMENT_FIXED_SIZE_OFFSET, + "invalid offset for group_segment_fixed_size"); +static_assert(offsetof(kernel_descriptor_t, private_segment_fixed_size) == + PRIVATE_SEGMENT_FIXED_SIZE_OFFSET, + "invalid offset for private_segment_fixed_size"); +static_assert(offsetof(kernel_descriptor_t, reserved0) == RESERVED0_OFFSET, + "invalid offset for reserved0"); +static_assert(offsetof(kernel_descriptor_t, kernel_code_entry_byte_offset) == + KERNEL_CODE_ENTRY_BYTE_OFFSET_OFFSET, + "invalid offset for kernel_code_entry_byte_offset"); +static_assert(offsetof(kernel_descriptor_t, reserved1) == RESERVED1_OFFSET, + "invalid offset for reserved1"); +static_assert(offsetof(kernel_descriptor_t, compute_pgm_rsrc3) == + COMPUTE_PGM_RSRC3_OFFSET, + "invalid offset for compute_pgm_rsrc3"); +static_assert(offsetof(kernel_descriptor_t, compute_pgm_rsrc1) == + COMPUTE_PGM_RSRC1_OFFSET, + "invalid offset for compute_pgm_rsrc1"); +static_assert(offsetof(kernel_descriptor_t, compute_pgm_rsrc2) == + COMPUTE_PGM_RSRC2_OFFSET, + "invalid offset for compute_pgm_rsrc2"); +static_assert(offsetof(kernel_descriptor_t, kernel_code_properties) == + KERNEL_CODE_PROPERTIES_OFFSET, + "invalid offset for kernel_code_properties"); +static_assert(offsetof(kernel_descriptor_t, reserved2) == RESERVED2_OFFSET, + "invalid offset for reserved2"); } // end namespace amdhsa } // end namespace llvm diff --git a/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h b/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h --- a/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h +++ b/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h @@ -17,10 +17,11 @@ #include "llvm/ADT/ArrayRef.h" #include "llvm/MC/MCContext.h" -#include "llvm/MC/MCInstrInfo.h" #include "llvm/MC/MCDisassembler/MCDisassembler.h" #include "llvm/MC/MCDisassembler/MCRelocationInfo.h" #include "llvm/MC/MCDisassembler/MCSymbolizer.h" +#include "llvm/MC/MCInstrInfo.h" +#include "llvm/Support/DataExtractor.h" #include #include @@ -66,6 +67,33 @@ DecodeStatus tryDecodeInst(const uint8_t* Table, MCInst &MI, uint64_t Inst, uint64_t Address) const; + Optional onSymbolStart(SymbolInfoTy &Symbol, uint64_t &Size, + ArrayRef Bytes, + uint64_t Address, + raw_ostream &CStream) const override; + + DecodeStatus decodeKernelDescriptor(StringRef KdName, ArrayRef Bytes, + uint64_t KdAddress) const; + + DecodeStatus + decodeKernelDescriptorDirective(DataExtractor::Cursor &Cursor, + ArrayRef Bytes, + raw_string_ostream &KdStream) const; + + /// Decode as directives that handle COMPUTE_PGM_RSRC1. + /// \param FourByteBuffer - Bytes holding contents of COMPUTE_PGM_RSRC1. + /// \param KdStream - Stream to write the disassembled directives to. + // NOLINTNEXTLINE(readability-identifier-naming) + DecodeStatus decodeCOMPUTE_PGM_RSRC1(uint32_t FourByteBuffer, + raw_string_ostream &KdStream) const; + + /// Decode as directives that handle COMPUTE_PGM_RSRC2. + /// \param FourByteBuffer - Bytes holding contents of COMPUTE_PGM_RSRC2. + /// \param KdStream - Stream to write the disassembled directives to. + // NOLINTNEXTLINE(readability-identifier-naming) + DecodeStatus decodeCOMPUTE_PGM_RSRC2(uint32_t FourByteBuffer, + raw_string_ostream &KdStream) const; + DecodeStatus convertSDWAInst(MCInst &MI) const; DecodeStatus convertDPP8Inst(MCInst &MI) const; DecodeStatus convertMIMGInst(MCInst &MI) const; diff --git a/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp b/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp --- a/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp +++ b/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp @@ -34,6 +34,7 @@ #include "llvm/MC/MCFixedLenDisassembler.h" #include "llvm/MC/MCInst.h" #include "llvm/MC/MCSubtargetInfo.h" +#include "llvm/Support/AMDHSAKernelDescriptor.h" #include "llvm/Support/Endian.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/MathExtras.h" @@ -1225,6 +1226,350 @@ return STI.getFeatureBits()[AMDGPU::FeatureGFX10]; } +//===----------------------------------------------------------------------===// +// AMDGPU specific symbol handling +//===----------------------------------------------------------------------===// +#define PRINT_DIRECTIVE(DIRECTIVE, MASK) \ + do { \ + KdStream << Indent << DIRECTIVE " " \ + << ((FourByteBuffer & MASK) >> (MASK##_SHIFT)) << '\n'; \ + } while (0) + +// NOLINTNEXTLINE(readability-identifier-naming) +MCDisassembler::DecodeStatus AMDGPUDisassembler::decodeCOMPUTE_PGM_RSRC1( + uint32_t FourByteBuffer, raw_string_ostream &KdStream) const { + using namespace amdhsa; + StringRef Indent = "\t"; + + // We cannot accurately backward compute #VGPRs used from + // GRANULATED_WORKITEM_VGPR_COUNT. But we are concerned with getting the same + // value of GRANULATED_WORKITEM_VGPR_COUNT in the reassembled binary. So we + // simply calculate the inverse of what the assembler does. + + uint32_t GranulatedWorkitemVGPRCount = + (FourByteBuffer & COMPUTE_PGM_RSRC1_GRANULATED_WORKITEM_VGPR_COUNT) >> + COMPUTE_PGM_RSRC1_GRANULATED_WORKITEM_VGPR_COUNT_SHIFT; + + uint32_t NextFreeVGPR = (GranulatedWorkitemVGPRCount + 1) * + AMDGPU::IsaInfo::getVGPREncodingGranule(&STI); + + KdStream << Indent << ".amdhsa_next_free_vgpr " << NextFreeVGPR << '\n'; + + // We cannot backward compute values used to calculate + // GRANULATED_WAVEFRONT_SGPR_COUNT. Hence the original values for following + // directives can't be computed: + // .amdhsa_reserve_vcc + // .amdhsa_reserve_flat_scratch + // .amdhsa_reserve_xnack_mask + // They take their respective default values if not specified in the assembly. + // + // GRANULATED_WAVEFRONT_SGPR_COUNT + // = f(NEXT_FREE_SGPR + VCC + FLAT_SCRATCH + XNACK_MASK) + // + // We compute the inverse as though all directives apart from NEXT_FREE_SGPR + // are set to 0. So while disassembling we consider that: + // + // GRANULATED_WAVEFRONT_SGPR_COUNT + // = f(NEXT_FREE_SGPR + 0 + 0 + 0) + // + // The disassembler cannot recover the original values of those 3 directives. + + uint32_t GranulatedWavefrontSGPRCount = + (FourByteBuffer & COMPUTE_PGM_RSRC1_GRANULATED_WAVEFRONT_SGPR_COUNT) >> + COMPUTE_PGM_RSRC1_GRANULATED_WAVEFRONT_SGPR_COUNT_SHIFT; + + if (isGFX10() && GranulatedWavefrontSGPRCount) + return MCDisassembler::Fail; + + uint32_t NextFreeSGPR = (GranulatedWavefrontSGPRCount + 1) * + AMDGPU::IsaInfo::getSGPREncodingGranule(&STI); + + KdStream << Indent << ".amdhsa_reserve_vcc " << 0 << '\n'; + KdStream << Indent << ".amdhsa_reserve_flat_scratch " << 0 << '\n'; + KdStream << Indent << ".amdhsa_reserve_xnack_mask " << 0 << '\n'; + KdStream << Indent << ".amdhsa_next_free_sgpr " << NextFreeSGPR << "\n"; + + if (FourByteBuffer & COMPUTE_PGM_RSRC1_PRIORITY) + return MCDisassembler::Fail; + + PRINT_DIRECTIVE(".amdhsa_float_round_mode_32", + COMPUTE_PGM_RSRC1_FLOAT_ROUND_MODE_32); + PRINT_DIRECTIVE(".amdhsa_float_round_mode_16_64", + COMPUTE_PGM_RSRC1_FLOAT_ROUND_MODE_16_64); + PRINT_DIRECTIVE(".amdhsa_float_denorm_mode_32", + COMPUTE_PGM_RSRC1_FLOAT_DENORM_MODE_32); + PRINT_DIRECTIVE(".amdhsa_float_denorm_mode_16_64", + COMPUTE_PGM_RSRC1_FLOAT_DENORM_MODE_16_64); + + if (FourByteBuffer & COMPUTE_PGM_RSRC1_PRIV) + return MCDisassembler::Fail; + + PRINT_DIRECTIVE(".amdhsa_dx10_clamp", COMPUTE_PGM_RSRC1_ENABLE_DX10_CLAMP); + + if (FourByteBuffer & COMPUTE_PGM_RSRC1_DEBUG_MODE) + return MCDisassembler::Fail; + + PRINT_DIRECTIVE(".amdhsa_ieee_mode", COMPUTE_PGM_RSRC1_ENABLE_IEEE_MODE); + + if (FourByteBuffer & COMPUTE_PGM_RSRC1_BULKY) + return MCDisassembler::Fail; + + if (FourByteBuffer & COMPUTE_PGM_RSRC1_CDBG_USER) + return MCDisassembler::Fail; + + PRINT_DIRECTIVE(".amdhsa_fp16_overflow", COMPUTE_PGM_RSRC1_FP16_OVFL); + + if (FourByteBuffer & COMPUTE_PGM_RSRC1_RESERVED0) + return MCDisassembler::Fail; + + if (isGFX10()) { + PRINT_DIRECTIVE(".amdhsa_workgroup_processor_mode", + COMPUTE_PGM_RSRC1_WGP_MODE); + PRINT_DIRECTIVE(".amdhsa_memory_ordered", COMPUTE_PGM_RSRC1_MEM_ORDERED); + PRINT_DIRECTIVE(".amdhsa_forward_progress", COMPUTE_PGM_RSRC1_FWD_PROGRESS); + } + return MCDisassembler::Success; +} + +// NOLINTNEXTLINE(readability-identifier-naming) +MCDisassembler::DecodeStatus AMDGPUDisassembler::decodeCOMPUTE_PGM_RSRC2( + uint32_t FourByteBuffer, raw_string_ostream &KdStream) const { + using namespace amdhsa; + StringRef Indent = "\t"; + PRINT_DIRECTIVE( + ".amdhsa_system_sgpr_private_segment_wavefront_offset", + COMPUTE_PGM_RSRC2_ENABLE_SGPR_PRIVATE_SEGMENT_WAVEFRONT_OFFSET); + PRINT_DIRECTIVE(".amdhsa_system_sgpr_workgroup_id_x", + COMPUTE_PGM_RSRC2_ENABLE_SGPR_WORKGROUP_ID_X); + PRINT_DIRECTIVE(".amdhsa_system_sgpr_workgroup_id_y", + COMPUTE_PGM_RSRC2_ENABLE_SGPR_WORKGROUP_ID_Y); + PRINT_DIRECTIVE(".amdhsa_system_sgpr_workgroup_id_z", + COMPUTE_PGM_RSRC2_ENABLE_SGPR_WORKGROUP_ID_Z); + PRINT_DIRECTIVE(".amdhsa_system_sgpr_workgroup_info", + COMPUTE_PGM_RSRC2_ENABLE_SGPR_WORKGROUP_INFO); + PRINT_DIRECTIVE(".amdhsa_system_vgpr_workitem_id", + COMPUTE_PGM_RSRC2_ENABLE_VGPR_WORKITEM_ID); + + if (FourByteBuffer & COMPUTE_PGM_RSRC2_ENABLE_EXCEPTION_ADDRESS_WATCH) + return MCDisassembler::Fail; + + if (FourByteBuffer & COMPUTE_PGM_RSRC2_ENABLE_EXCEPTION_MEMORY) + return MCDisassembler::Fail; + + if (FourByteBuffer & COMPUTE_PGM_RSRC2_GRANULATED_LDS_SIZE) + return MCDisassembler::Fail; + + PRINT_DIRECTIVE( + ".amdhsa_exception_fp_ieee_invalid_op", + COMPUTE_PGM_RSRC2_ENABLE_EXCEPTION_IEEE_754_FP_INVALID_OPERATION); + PRINT_DIRECTIVE(".amdhsa_exception_fp_denorm_src", + COMPUTE_PGM_RSRC2_ENABLE_EXCEPTION_FP_DENORMAL_SOURCE); + PRINT_DIRECTIVE( + ".amdhsa_exception_fp_ieee_div_zero", + COMPUTE_PGM_RSRC2_ENABLE_EXCEPTION_IEEE_754_FP_DIVISION_BY_ZERO); + PRINT_DIRECTIVE(".amdhsa_exception_fp_ieee_overflow", + COMPUTE_PGM_RSRC2_ENABLE_EXCEPTION_IEEE_754_FP_OVERFLOW); + PRINT_DIRECTIVE(".amdhsa_exception_fp_ieee_underflow", + COMPUTE_PGM_RSRC2_ENABLE_EXCEPTION_IEEE_754_FP_UNDERFLOW); + PRINT_DIRECTIVE(".amdhsa_exception_fp_ieee_inexact", + COMPUTE_PGM_RSRC2_ENABLE_EXCEPTION_IEEE_754_FP_INEXACT); + PRINT_DIRECTIVE(".amdhsa_exception_int_div_zero", + COMPUTE_PGM_RSRC2_ENABLE_EXCEPTION_INT_DIVIDE_BY_ZERO); + + if (FourByteBuffer & COMPUTE_PGM_RSRC2_RESERVED0) + return MCDisassembler::Fail; + + return MCDisassembler::Success; +} + +#undef PRINT_DIRECTIVE + +MCDisassembler::DecodeStatus +AMDGPUDisassembler::decodeKernelDescriptorDirective( + DataExtractor::Cursor &Cursor, ArrayRef Bytes, + raw_string_ostream &KdStream) const { +#define PRINT_DIRECTIVE(DIRECTIVE, MASK) \ + do { \ + KdStream << Indent << DIRECTIVE " " \ + << ((TwoByteBuffer & MASK) >> (MASK##_SHIFT)) << '\n'; \ + } while (0) + + uint16_t TwoByteBuffer = 0; + uint32_t FourByteBuffer = 0; + uint64_t EightByteBuffer = 0; + + StringRef ReservedBytes; + StringRef Indent = "\t"; + + assert(Bytes.size() == 64); + DataExtractor DE(Bytes, /*IsLittleEndian=*/true, /*AddressSize=*/8); + + switch (Cursor.tell()) { + case amdhsa::GROUP_SEGMENT_FIXED_SIZE_OFFSET: + FourByteBuffer = DE.getU32(Cursor); + KdStream << Indent << ".amdhsa_group_segment_fixed_size " << FourByteBuffer + << '\n'; + return MCDisassembler::Success; + + case amdhsa::PRIVATE_SEGMENT_FIXED_SIZE_OFFSET: + FourByteBuffer = DE.getU32(Cursor); + KdStream << Indent << ".amdhsa_private_segment_fixed_size " + << FourByteBuffer << '\n'; + return MCDisassembler::Success; + + case amdhsa::RESERVED0_OFFSET: + // 8 reserved bytes, must be 0. + EightByteBuffer = DE.getU64(Cursor); + if (EightByteBuffer) { + return MCDisassembler::Fail; + } + return MCDisassembler::Success; + + case amdhsa::KERNEL_CODE_ENTRY_BYTE_OFFSET_OFFSET: + // KERNEL_CODE_ENTRY_BYTE_OFFSET + // So far no directive controls this for Code Object V3, so simply skip for + // disassembly. + DE.skip(Cursor, 8); + return MCDisassembler::Success; + + case amdhsa::RESERVED1_OFFSET: + // 20 reserved bytes, must be 0. + ReservedBytes = DE.getBytes(Cursor, 20); + for (int I = 0; I < 20; ++I) { + if (ReservedBytes[I] != 0) { + return MCDisassembler::Fail; + } + } + return MCDisassembler::Success; + + case amdhsa::COMPUTE_PGM_RSRC3_OFFSET: + // COMPUTE_PGM_RSRC3 + // - Only set for GFX10, GFX6-9 have this to be 0. + // - Currently no directives directly control this. + FourByteBuffer = DE.getU32(Cursor); + if (!isGFX10() && FourByteBuffer) { + return MCDisassembler::Fail; + } + return MCDisassembler::Success; + + case amdhsa::COMPUTE_PGM_RSRC1_OFFSET: + FourByteBuffer = DE.getU32(Cursor); + if (decodeCOMPUTE_PGM_RSRC1(FourByteBuffer, KdStream) == + MCDisassembler::Fail) { + return MCDisassembler::Fail; + } + return MCDisassembler::Success; + + case amdhsa::COMPUTE_PGM_RSRC2_OFFSET: + FourByteBuffer = DE.getU32(Cursor); + if (decodeCOMPUTE_PGM_RSRC2(FourByteBuffer, KdStream) == + MCDisassembler::Fail) { + return MCDisassembler::Fail; + } + return MCDisassembler::Success; + + case amdhsa::KERNEL_CODE_PROPERTIES_OFFSET: + using namespace amdhsa; + TwoByteBuffer = DE.getU16(Cursor); + + PRINT_DIRECTIVE(".amdhsa_user_sgpr_private_segment_buffer", + KERNEL_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_BUFFER); + PRINT_DIRECTIVE(".amdhsa_user_sgpr_dispatch_ptr", + KERNEL_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_PTR); + PRINT_DIRECTIVE(".amdhsa_user_sgpr_queue_ptr", + KERNEL_CODE_PROPERTY_ENABLE_SGPR_QUEUE_PTR); + PRINT_DIRECTIVE(".amdhsa_user_sgpr_kernarg_segment_ptr", + KERNEL_CODE_PROPERTY_ENABLE_SGPR_KERNARG_SEGMENT_PTR); + PRINT_DIRECTIVE(".amdhsa_user_sgpr_dispatch_id", + KERNEL_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_ID); + PRINT_DIRECTIVE(".amdhsa_user_sgpr_flat_scratch_init", + KERNEL_CODE_PROPERTY_ENABLE_SGPR_FLAT_SCRATCH_INIT); + PRINT_DIRECTIVE(".amdhsa_user_sgpr_private_segment_size", + KERNEL_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_SIZE); + + if (TwoByteBuffer & KERNEL_CODE_PROPERTY_RESERVED0) + return MCDisassembler::Fail; + + // Reserved for GFX9 + if (isGFX9() && + (TwoByteBuffer & KERNEL_CODE_PROPERTY_ENABLE_WAVEFRONT_SIZE32)) { + return MCDisassembler::Fail; + } else if (isGFX10()) { + PRINT_DIRECTIVE(".amdhsa_wavefront_size32", + KERNEL_CODE_PROPERTY_ENABLE_WAVEFRONT_SIZE32); + } + + if (TwoByteBuffer & KERNEL_CODE_PROPERTY_RESERVED1) + return MCDisassembler::Fail; + + return MCDisassembler::Success; + + case amdhsa::RESERVED2_OFFSET: + // 6 bytes from here are reserved, must be 0. + ReservedBytes = DE.getBytes(Cursor, 6); + for (int I = 0; I < 6; ++I) { + if (ReservedBytes[I] != 0) + return MCDisassembler::Fail; + } + return MCDisassembler::Success; + + default: + llvm_unreachable("Unhandled index. Case statements cover everything."); + return MCDisassembler::Fail; + } +#undef PRINT_DIRECTIVE +} + +MCDisassembler::DecodeStatus AMDGPUDisassembler::decodeKernelDescriptor( + StringRef KdName, ArrayRef Bytes, uint64_t KdAddress) const { + // CP microcode requires the kernel descriptor to be 64 aligned. + if (Bytes.size() != 64 || KdAddress % 64 != 0) + return MCDisassembler::Fail; + + std::string Kd; + raw_string_ostream KdStream(Kd); + KdStream << ".amdhsa_kernel " << KdName << '\n'; + + DataExtractor::Cursor C(0); + while (C && C.tell() < Bytes.size()) { + MCDisassembler::DecodeStatus Status = + decodeKernelDescriptorDirective(C, Bytes, KdStream); + + cantFail(C.takeError()); + + if (Status == MCDisassembler::Fail) + return MCDisassembler::Fail; + } + KdStream << ".end_amdhsa_kernel\n"; + outs() << KdStream.str(); + return MCDisassembler::Success; +} + +Optional +AMDGPUDisassembler::onSymbolStart(SymbolInfoTy &Symbol, uint64_t &Size, + ArrayRef Bytes, uint64_t Address, + raw_ostream &CStream) const { + // Right now only kernel descriptor needs to be handled. + // We ignore all other symbols for target specific handling. + // TODO: + // Fix the spurious symbol issue for AMDGPU kernels. Exists for both Code + // Object V2 and V3 when symbols are marked protected. + + // amd_kernel_code_t for Code Object V2. + if (Symbol.Type == ELF::STT_AMDGPU_HSA_KERNEL) { + Size = 256; + return MCDisassembler::Fail; + } + + // Code Object V3 kernel descriptors. + StringRef Name = Symbol.Name; + if (Symbol.Type == ELF::STT_OBJECT && Name.endswith(StringRef(".kd"))) { + Size = 64; // Size = 64 regardless of success or failure. + return decodeKernelDescriptor(Name.drop_back(3), Bytes, Address); + } + return None; +} + //===----------------------------------------------------------------------===// // AMDGPUSymbolizer //===----------------------------------------------------------------------===// diff --git a/llvm/test/CodeGen/AMDGPU/nop-data.ll b/llvm/test/CodeGen/AMDGPU/nop-data.ll --- a/llvm/test/CodeGen/AMDGPU/nop-data.ll +++ b/llvm/test/CodeGen/AMDGPU/nop-data.ll @@ -1,7 +1,7 @@ ; RUN: llc -mtriple=amdgcn--amdhsa -mattr=-code-object-v3 -mcpu=fiji -filetype=obj < %s | llvm-objdump -d - --mcpu=fiji | FileCheck %s ; CHECK: : -; CHECK-NEXT: s_endpgm +; CHECK: s_endpgm define amdgpu_kernel void @kernel0() align 256 { entry: ret void @@ -80,7 +80,7 @@ ; CHECK-EMPTY: ; CHECK-NEXT: : -; CHECK-NEXT: s_endpgm +; CHECK: s_endpgm define amdgpu_kernel void @kernel1(i32 addrspace(1)* addrspace(4)* %ptr.out) align 256 { entry: ret void diff --git a/llvm/test/tools/llvm-objdump/ELF/AMDGPU/kd-failure.s b/llvm/test/tools/llvm-objdump/ELF/AMDGPU/kd-failure.s new file mode 100644 --- /dev/null +++ b/llvm/test/tools/llvm-objdump/ELF/AMDGPU/kd-failure.s @@ -0,0 +1,37 @@ +;; Failure test. We create a malformed kernel descriptor (KD) by manually +;; setting the bytes, because one can't create a malformed KD using the +;; assembler directives. + +; RUN: llvm-mc %s -mattr=+code-object-v3 --triple=amdgcn-amd-amdhsa -mcpu=gfx908 -filetype=obj -o %t.o + +; RUN: printf ".type my_kernel.kd, @object \nmy_kernel.kd:\n.size my_kernel.kd, 64\n" > %t1.sym_info +; RUN: llvm-objdump --disassemble-symbols=my_kernel.kd %t.o \ +; RUN: | tail -n +9 > %t1.sym_content +; RUN: cat %t1.sym_info %t1.sym_content > %t1.s + +; RUN: llvm-mc %t1.s -mattr=+code-object-v3 --triple=amdgcn-amd-amdhsa -mcpu=gfx908 -filetype=obj -o %t-re-assemble.o +; RUN: diff %t.o %t-re-assemble.o + +;; Test failure by setting one of the reserved bytes to non-zero value. + +.type my_kernel.kd, @object +.size my_kernel.kd, 64 +my_kernel.kd: + .long 0x00000000 ;; group_segment_fixed_size + .long 0x00000000 ;; private_segment_fixed_size + .quad 0x00FF000000000000 ;; reserved bytes. + .quad 0x0000000000000000 ;; kernel_code_entry_byte_offset, any value works. + + ;; 20 reserved bytes. + .quad 0x0000000000000000 + .quad 0x0000000000000000 + .long 0x00000000 + + .long 0x00000000 ;; compute_PGM_RSRC3 + .long 0x00000000 ;; compute_PGM_RSRC1 + .long 0x00000000 ;; compute_PGM_RSRC2 + .short 0x0000 ;; additional fields. + + ;; 6 reserved bytes. + .long 0x0000000 + .short 0x0000 diff --git a/llvm/test/tools/llvm-objdump/ELF/AMDGPU/kd-sgpr.s b/llvm/test/tools/llvm-objdump/ELF/AMDGPU/kd-sgpr.s new file mode 100644 --- /dev/null +++ b/llvm/test/tools/llvm-objdump/ELF/AMDGPU/kd-sgpr.s @@ -0,0 +1,49 @@ +;; Test disassembly for GRANULATED_WAVEFRONT_SGPR_COUNT in the kernel descriptor. + +; RUN: split-file %s %t.dir + +; RUN: llvm-mc %t.dir/1.s -mattr=+code-object-v3 --triple=amdgcn-amd-amdhsa -mcpu=gfx908 -filetype=obj -o %t1 +; RUN: llvm-objdump --disassemble-symbols=my_kernel_1.kd %t1 | tail -n +8 \ +; RUN: | llvm-mc --triple=amdgcn-amd-amdhsa -mcpu=gfx908 -filetype=obj -o %t1-re-assemble +; RUN: diff %t1 %t1-re-assemble + +; RUN: llvm-mc %t.dir/2.s -mattr=+code-object-v3 --triple=amdgcn-amd-amdhsa -mcpu=gfx908 -filetype=obj -o %t2 +; RUN: llvm-objdump --disassemble-symbols=my_kernel_2.kd %t2 | tail -n +8 \ +; RUN: | llvm-mc --triple=amdgcn-amd-amdhsa -mcpu=gfx908 -filetype=obj -o %t2-re-assemble +; RUN: diff %t2 %t2-re-assemble + +; RUN: llvm-mc %t.dir/3.s -mattr=+code-object-v3 --triple=amdgcn-amd-amdhsa -mcpu=gfx908 -filetype=obj -o %t3 +; RUN: llvm-objdump --disassemble-symbols=my_kernel_3.kd %t3 | tail -n +8 \ +; RUN: | llvm-mc --triple=amdgcn-amd-amdhsa -mcpu=gfx908 -filetype=obj -o %t3-re-assemble +; RUN: diff %t3 %t3-re-assemble + + +;--- 1.s +;; Only set next_free_sgpr. +.amdhsa_kernel my_kernel_1 + .amdhsa_next_free_vgpr 0 + .amdhsa_next_free_sgpr 42 + .amdhsa_reserve_flat_scratch 0 + .amdhsa_reserve_xnack_mask 0 + .amdhsa_reserve_vcc 0 +.end_amdhsa_kernel + +;--- 2.s +;; Only set other directives. +.amdhsa_kernel my_kernel_2 + .amdhsa_next_free_vgpr 0 + .amdhsa_next_free_sgpr 0 + .amdhsa_reserve_flat_scratch 1 + .amdhsa_reserve_xnack_mask 1 + .amdhsa_reserve_vcc 1 +.end_amdhsa_kernel + +;--- 3.s +;; Set all affecting directives. +.amdhsa_kernel my_kernel_3 + .amdhsa_next_free_vgpr 0 + .amdhsa_next_free_sgpr 35 + .amdhsa_reserve_flat_scratch 1 + .amdhsa_reserve_xnack_mask 1 + .amdhsa_reserve_vcc 1 +.end_amdhsa_kernel diff --git a/llvm/test/tools/llvm-objdump/ELF/AMDGPU/kd-vgpr.s b/llvm/test/tools/llvm-objdump/ELF/AMDGPU/kd-vgpr.s new file mode 100644 --- /dev/null +++ b/llvm/test/tools/llvm-objdump/ELF/AMDGPU/kd-vgpr.s @@ -0,0 +1,36 @@ +;; Test disassembly for GRANULATED_WORKITEM_VGPR_COUNT in the kernel descriptor. + +; RUN: split-file %s %t.dir + +; RUN: llvm-mc %t.dir/1.s -mattr=+code-object-v3 --triple=amdgcn-amd-amdhsa -mcpu=gfx908 -filetype=obj -o %t1 +; RUN: llvm-objdump --disassemble-symbols=my_kernel_1.kd %t1 | tail -n +8 \ +; RUN: | llvm-mc --triple=amdgcn-amd-amdhsa -mcpu=gfx908 -filetype=obj -o %t1-re-assemble +; RUN: diff %t1 %t1-re-assemble + +; RUN: llvm-mc %t.dir/2.s -mattr=+code-object-v3 --triple=amdgcn-amd-amdhsa -mcpu=gfx908 -filetype=obj -o %t2 +; RUN: llvm-objdump --disassemble-symbols=my_kernel_2.kd %t2 | tail -n +8 \ +; RUN: | llvm-mc --triple=amdgcn-amd-amdhsa -mcpu=gfx908 -filetype=obj -o %t2-re-assemble +; RUN: diff %t2 %t2-re-assemble + +; RUN: llvm-mc %t.dir/3.s -mattr=+code-object-v3 --triple=amdgcn-amd-amdhsa -mcpu=gfx908 -filetype=obj -o %t3 +; RUN: llvm-objdump --disassemble-symbols=my_kernel_3.kd %t3 | tail -n +8 \ +; RUN: | llvm-mc --triple=amdgcn-amd-amdhsa -mcpu=gfx908 -filetype=obj -o %t3-re-assemble +; RUN: diff %t3 %t3-re-assemble + +;--- 1.s +.amdhsa_kernel my_kernel_1 + .amdhsa_next_free_vgpr 23 + .amdhsa_next_free_sgpr 0 +.end_amdhsa_kernel + +;--- 2.s +.amdhsa_kernel my_kernel_2 + .amdhsa_next_free_vgpr 14 + .amdhsa_next_free_sgpr 0 +.end_amdhsa_kernel + +;--- 3.s +.amdhsa_kernel my_kernel_3 + .amdhsa_next_free_vgpr 32 + .amdhsa_next_free_sgpr 0 +.end_amdhsa_kernel diff --git a/llvm/test/tools/llvm-objdump/ELF/AMDGPU/kd-zeroed-gfx10.s b/llvm/test/tools/llvm-objdump/ELF/AMDGPU/kd-zeroed-gfx10.s new file mode 100644 --- /dev/null +++ b/llvm/test/tools/llvm-objdump/ELF/AMDGPU/kd-zeroed-gfx10.s @@ -0,0 +1,58 @@ +;; Entirely zeroed kernel descriptor (for GFX10). + +; RUN: llvm-mc %s -mattr=+code-object-v3 --triple=amdgcn-amd-amdhsa -mcpu=gfx1010 -filetype=obj -o %t +; RUN: llvm-objdump -s -j .text %t | FileCheck --check-prefix=OBJDUMP %s + +;; TODO: +;; This file and kd-zeroed-raw.s should produce the same output for the kernel +;; descriptor - a block of 64 zeroed bytes. But looks like the assembler sets +;; the FWD_PROGRESS bit in COMPUTE_PGM_RSRC1 to 1 even when the directive +;; mentions 0 (see line 36). + +;; Check the raw bytes right now. + +; OBJDUMP: 0000 00000000 00000000 00000000 00000000 +; OBJDUMP-NEXT: 0010 00000000 00000000 00000000 00000000 +; OBJDUMP-NEXT: 0020 00000000 00000000 00000000 00000000 +; OBJDUMP-NEXT: 0030 01000000 00000000 00000000 00000000 + +.amdhsa_kernel my_kernel + .amdhsa_group_segment_fixed_size 0 + .amdhsa_private_segment_fixed_size 0 + .amdhsa_next_free_vgpr 8 + .amdhsa_reserve_vcc 0 + .amdhsa_reserve_flat_scratch 0 + .amdhsa_reserve_xnack_mask 0 + .amdhsa_next_free_sgpr 8 + .amdhsa_float_round_mode_32 0 + .amdhsa_float_round_mode_16_64 0 + .amdhsa_float_denorm_mode_32 0 + .amdhsa_float_denorm_mode_16_64 0 + .amdhsa_dx10_clamp 0 + .amdhsa_ieee_mode 0 + .amdhsa_fp16_overflow 0 + .amdhsa_workgroup_processor_mode 0 + .amdhsa_memory_ordered 0 + .amdhsa_forward_progress 0 + .amdhsa_system_sgpr_private_segment_wavefront_offset 0 + .amdhsa_system_sgpr_workgroup_id_x 0 + .amdhsa_system_sgpr_workgroup_id_y 0 + .amdhsa_system_sgpr_workgroup_id_z 0 + .amdhsa_system_sgpr_workgroup_info 0 + .amdhsa_system_vgpr_workitem_id 0 + .amdhsa_exception_fp_ieee_invalid_op 0 + .amdhsa_exception_fp_denorm_src 0 + .amdhsa_exception_fp_ieee_div_zero 0 + .amdhsa_exception_fp_ieee_overflow 0 + .amdhsa_exception_fp_ieee_underflow 0 + .amdhsa_exception_fp_ieee_inexact 0 + .amdhsa_exception_int_div_zero 0 + .amdhsa_user_sgpr_private_segment_buffer 0 + .amdhsa_user_sgpr_dispatch_ptr 0 + .amdhsa_user_sgpr_queue_ptr 0 + .amdhsa_user_sgpr_kernarg_segment_ptr 0 + .amdhsa_user_sgpr_dispatch_id 0 + .amdhsa_user_sgpr_flat_scratch_init 0 + .amdhsa_user_sgpr_private_segment_size 0 + .amdhsa_wavefront_size32 0 +.end_amdhsa_kernel diff --git a/llvm/test/tools/llvm-objdump/ELF/AMDGPU/kd-zeroed-gfx9.s b/llvm/test/tools/llvm-objdump/ELF/AMDGPU/kd-zeroed-gfx9.s new file mode 100644 --- /dev/null +++ b/llvm/test/tools/llvm-objdump/ELF/AMDGPU/kd-zeroed-gfx9.s @@ -0,0 +1,53 @@ +;; Entirely zeroed kernel descriptor (for GFX9). + +; RUN: llvm-mc %s -mattr=+code-object-v3 --triple=amdgcn-amd-amdhsa -mcpu=gfx908 -filetype=obj -o %t1 +; RUN: llvm-objdump --disassemble-symbols=my_kernel.kd %t1 \ +; RUN: | tail -n +8 | llvm-mc --triple=amdgcn-amd-amdhsa -mcpu=gfx908 -filetype=obj -o %t2 +; RUN: diff %t1 %t2 + +; RUN: llvm-objdump -s -j .text %t1 | FileCheck --check-prefix=OBJDUMP %s + +; OBJDUMP: 0000 00000000 00000000 00000000 00000000 +; OBJDUMP-NEXT: 0010 00000000 00000000 00000000 00000000 +; OBJDUMP-NEXT: 0020 00000000 00000000 00000000 00000000 +; OBJDUMP-NEXT: 0030 00000000 00000000 00000000 00000000 + +;; This file and kd-zeroed-raw.s produce the same output for the kernel +;; descriptor - a block of 64 zeroed bytes. + +.amdhsa_kernel my_kernel + .amdhsa_group_segment_fixed_size 0 + .amdhsa_private_segment_fixed_size 0 + .amdhsa_next_free_vgpr 0 + .amdhsa_reserve_vcc 0 + .amdhsa_reserve_flat_scratch 0 + .amdhsa_reserve_xnack_mask 0 + .amdhsa_next_free_sgpr 0 + .amdhsa_float_round_mode_32 0 + .amdhsa_float_round_mode_16_64 0 + .amdhsa_float_denorm_mode_32 0 + .amdhsa_float_denorm_mode_16_64 0 + .amdhsa_dx10_clamp 0 + .amdhsa_ieee_mode 0 + .amdhsa_fp16_overflow 0 + .amdhsa_system_sgpr_private_segment_wavefront_offset 0 + .amdhsa_system_sgpr_workgroup_id_x 0 + .amdhsa_system_sgpr_workgroup_id_y 0 + .amdhsa_system_sgpr_workgroup_id_z 0 + .amdhsa_system_sgpr_workgroup_info 0 + .amdhsa_system_vgpr_workitem_id 0 + .amdhsa_exception_fp_ieee_invalid_op 0 + .amdhsa_exception_fp_denorm_src 0 + .amdhsa_exception_fp_ieee_div_zero 0 + .amdhsa_exception_fp_ieee_overflow 0 + .amdhsa_exception_fp_ieee_underflow 0 + .amdhsa_exception_fp_ieee_inexact 0 + .amdhsa_exception_int_div_zero 0 + .amdhsa_user_sgpr_private_segment_buffer 0 + .amdhsa_user_sgpr_dispatch_ptr 0 + .amdhsa_user_sgpr_queue_ptr 0 + .amdhsa_user_sgpr_kernarg_segment_ptr 0 + .amdhsa_user_sgpr_dispatch_id 0 + .amdhsa_user_sgpr_flat_scratch_init 0 + .amdhsa_user_sgpr_private_segment_size 0 +.end_amdhsa_kernel diff --git a/llvm/test/tools/llvm-objdump/ELF/AMDGPU/kd-zeroed-raw.s b/llvm/test/tools/llvm-objdump/ELF/AMDGPU/kd-zeroed-raw.s new file mode 100644 --- /dev/null +++ b/llvm/test/tools/llvm-objdump/ELF/AMDGPU/kd-zeroed-raw.s @@ -0,0 +1,41 @@ +; RUN: llvm-mc %s -mattr=+code-object-v3 --triple=amdgcn-amd-amdhsa -mcpu=gfx908 -filetype=obj -o %t1 +; RUN: llvm-objdump --disassemble-symbols=my_kernel.kd %t1 \ +; RUN: | tail -n +8 | llvm-mc --triple=amdgcn-amd-amdhsa -mcpu=gfx908 -filetype=obj -o %t2 +; RUN: llvm-objdump -s -j .text %t2 | FileCheck --check-prefix=OBJDUMP %s + +;; Not running lit-test over gfx10 (see kd-zeroed-gfx10.s for details). +;; kd-zeroed-raw.s and kd-zeroed-*.s should produce the same output for the +;; kernel descriptor - a block of 64 zeroed bytes. + +;; The disassembly will produce the contents of kd-zeroed-*.s which on being +;; assembled contains additional relocation info. A diff over the entire object +;; will fail in this case. So we check by looking the bytes in .text. + +; OBJDUMP: 0000 00000000 00000000 00000000 00000000 +; OBJDUMP-NEXT: 0010 00000000 00000000 00000000 00000000 +; OBJDUMP-NEXT: 0020 00000000 00000000 00000000 00000000 +; OBJDUMP-NEXT: 0030 00000000 00000000 00000000 00000000 + +;; The entire object is zeroed out. + +.type my_kernel.kd, @object +.size my_kernel.kd, 64 +my_kernel.kd: + .long 0x00000000 ;; group_segment_fixed_size + .long 0x00000000 ;; private_segment_fixed_size + .quad 0x0000000000000000 ;; reserved bytes. + .quad 0x0000000000000000 ;; kernel_code_entry_byte_offset, any value works. + + ;; 20 reserved bytes. + .quad 0x0000000000000000 + .quad 0x0000000000000000 + .long 0x00000000 + + .long 0x00000000 ;; compute_PGM_RSRC3 + .long 0x00000000 ;; compute_PGM_RSRC1 + .long 0x00000000 ;; compute_PGM_RSRC2 + .short 0x0000 ;; additional fields. + + ;; 6 reserved bytes. + .long 0x0000000 + .short 0x0000 diff --git a/llvm/tools/llvm-objdump/llvm-objdump.cpp b/llvm/tools/llvm-objdump/llvm-objdump.cpp --- a/llvm/tools/llvm-objdump/llvm-objdump.cpp +++ b/llvm/tools/llvm-objdump/llvm-objdump.cpp @@ -1854,23 +1854,6 @@ outs() << SectionName << ":\n"; } - if (Obj->isELF() && Obj->getArch() == Triple::amdgcn) { - if (Symbols[SI].Type == ELF::STT_AMDGPU_HSA_KERNEL) { - // skip amd_kernel_code_t at the begining of kernel symbol (256 bytes) - Start += 256; - } - if (SI == SE - 1 || - Symbols[SI + 1].Type == ELF::STT_AMDGPU_HSA_KERNEL) { - // cut trailing zeroes at the end of kernel - // cut up to 256 bytes - const uint64_t EndAlign = 256; - const auto Limit = End - (std::min)(EndAlign, End - Start); - while (End > Limit && - *reinterpret_cast(&Bytes[End - 4]) == 0) - End -= 4; - } - } - outs() << '\n'; if (!NoLeadingAddr) outs() << format(Is64Bits ? "%016" PRIx64 " " : "%08" PRIx64 " ",