diff --git a/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h b/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h --- a/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h +++ b/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h @@ -17,10 +17,11 @@ #include "llvm/ADT/ArrayRef.h" #include "llvm/MC/MCContext.h" -#include "llvm/MC/MCInstrInfo.h" #include "llvm/MC/MCDisassembler/MCDisassembler.h" #include "llvm/MC/MCDisassembler/MCRelocationInfo.h" #include "llvm/MC/MCDisassembler/MCSymbolizer.h" +#include "llvm/MC/MCInstrInfo.h" +#include "llvm/Support/DataExtractor.h" #include #include @@ -66,6 +67,25 @@ DecodeStatus tryDecodeInst(const uint8_t* Table, MCInst &MI, uint64_t Inst, uint64_t Address) const; + Optional onSymbolStart(SymbolInfoTy &Symbol, uint64_t &Size, + ArrayRef Bytes, + uint64_t Address, + raw_ostream &CStream) const override; + + DecodeStatus decodeKernelDescriptor(StringRef KdName, ArrayRef Bytes, + uint64_t &Size, uint64_t KdAddress) const; + + DecodeStatus + decodeKernelDescriptorDirective(DataExtractor::Cursor &Cursor, + ArrayRef Bytes, uint64_t &Size, + raw_string_ostream &KdStream) const; + + DecodeStatus decodeCOMPUTE_PGM_RSRC1(uint32_t FourByteBuffer, + raw_string_ostream &KdStream) const; + + DecodeStatus decodeCOMPUTE_PGM_RSRC2(uint32_t FourByteBuffer, + raw_string_ostream &KdStream) const; + DecodeStatus convertSDWAInst(MCInst &MI) const; DecodeStatus convertDPP8Inst(MCInst &MI) const; DecodeStatus convertMIMGInst(MCInst &MI) const; diff --git a/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp b/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp --- a/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp +++ b/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp @@ -34,6 +34,7 @@ #include "llvm/MC/MCFixedLenDisassembler.h" #include "llvm/MC/MCInst.h" #include "llvm/MC/MCSubtargetInfo.h" +#include "llvm/Support/AMDHSAKernelDescriptor.h" #include "llvm/Support/Endian.h" #include 
"llvm/Support/ErrorHandling.h" #include "llvm/Support/MathExtras.h" @@ -1215,6 +1216,353 @@ return STI.getFeatureBits()[AMDGPU::FeatureGFX10]; } +//===----------------------------------------------------------------------===// +// AMDGPU specific symbol handling +//===----------------------------------------------------------------------===// +#define HANDLE_TRIVIAL_FIELD(DIRECTIVE, MASK) \ + do { \ + KdStream << Indent << DIRECTIVE " " \ + << ((FourByteBuffer & MASK) >> (MASK##_SHIFT)) << '\n'; \ + } while (0) + +MCDisassembler::DecodeStatus AMDGPUDisassembler::decodeCOMPUTE_PGM_RSRC1( + uint32_t FourByteBuffer, raw_string_ostream &KdStream) const { + // Decode as directives that handle COMPUTE_PGM_RSRC1. + using namespace amdhsa; + StringRef Indent = "\t"; + + // We cannot accurately backward compute #VGPRs used from + // GRANULATED_WORKITEM_VGPR_COUNT. So we 'decode' as Code Object V3 predefined + // symbol. + KdStream << Indent << ".amdhsa_next_free_vgpr " + << ".amdgcn.next_free_vgpr" << '\n'; + + // We cannot backward compute values used to calculate + // GRANULATED_WAVEFRONT_SGPR_COUNT. Hence the original values for following + // directives can't be computed: + // .amdhsa_reserve_vcc + // .amdhsa_reserve_flat_scratch + // .amdhsa_reserve_xnack_mask + // They take their respective default values if not specified in assembly. + // + // GRANULATED_WAVEFRONT_SGPR_COUNT + // = NEXT_FREE_SGPR + VCC + FLAT_SCRATCH + XNACK_MASK + // + // To to get the exact same bytes in re-assembled binary, we disassemble + // aamdhsa_next_free_sgpr as the amdgcn.next_free_sgpr assembler symbol and + // set the remaining directives to "0". + // + // So now we see : + // + // GRANULATED_WAVEFRONT_SGPR_COUNT + // = NEXT_FREE_SGPR + 0 + 0 + 0 + // + // The disassembler cannot recover the original values of those directives. 
+ KdStream << Indent << ".amdhsa_reserve_vcc " << 0 << '\n'; + KdStream << Indent << ".amdhsa_reserve_flat_scratch " << 0 << '\n'; + KdStream << Indent << ".amdhsa_reserve_xnack_mask " << 0 << '\n'; + + KdStream << Indent << ".amdhsa_next_free_sgpr " + << ".amdgcn.next_free_sgpr\n"; + + if (FourByteBuffer & COMPUTE_PGM_RSRC1_PRIORITY) + return MCDisassembler::Fail; + + HANDLE_TRIVIAL_FIELD(".amdhsa_float_round_mode_32", + COMPUTE_PGM_RSRC1_FLOAT_ROUND_MODE_32); + HANDLE_TRIVIAL_FIELD(".amdhsa_float_round_mode_16_64", + COMPUTE_PGM_RSRC1_FLOAT_ROUND_MODE_16_64); + HANDLE_TRIVIAL_FIELD(".amdhsa_float_denorm_mode_32", + COMPUTE_PGM_RSRC1_FLOAT_DENORM_MODE_32); + HANDLE_TRIVIAL_FIELD(".amdhsa_float_denorm_mode_16_64", + COMPUTE_PGM_RSRC1_FLOAT_DENORM_MODE_16_64); + + if (FourByteBuffer & COMPUTE_PGM_RSRC1_PRIV) + return MCDisassembler::Fail; + + HANDLE_TRIVIAL_FIELD(".amdhsa_dx10_clamp", + COMPUTE_PGM_RSRC1_ENABLE_DX10_CLAMP); + + if (FourByteBuffer & COMPUTE_PGM_RSRC1_DEBUG_MODE) + return MCDisassembler::Fail; + + HANDLE_TRIVIAL_FIELD(".amdhsa_ieee_mode", COMPUTE_PGM_RSRC1_ENABLE_IEEE_MODE); + + if (FourByteBuffer & COMPUTE_PGM_RSRC1_BULKY) + return MCDisassembler::Fail; + + if (FourByteBuffer & COMPUTE_PGM_RSRC1_CDBG_USER) + return MCDisassembler::Fail; + + HANDLE_TRIVIAL_FIELD(".amdhsa_fp16_overflow", COMPUTE_PGM_RSRC1_FP16_OVFL); + + if (FourByteBuffer & COMPUTE_PGM_RSRC1_RESERVED0) + return MCDisassembler::Fail; + + HANDLE_TRIVIAL_FIELD(".amdhsa_workgroup_processor_mode", + COMPUTE_PGM_RSRC1_WGP_MODE); + HANDLE_TRIVIAL_FIELD(".amdhsa_memory_ordered", COMPUTE_PGM_RSRC1_MEM_ORDERED); + HANDLE_TRIVIAL_FIELD(".amdhsa_forward_progress", + COMPUTE_PGM_RSRC1_FWD_PROGRESS); + + return MCDisassembler::Success; +} // decodeCOMPUTE_PGM_RSRC1() + +MCDisassembler::DecodeStatus AMDGPUDisassembler::decodeCOMPUTE_PGM_RSRC2( + uint32_t FourByteBuffer, raw_string_ostream &KdStream) const { + // Decode as directives that handle COMPUTE_PGM_RSRC2. 
+ using namespace amdhsa; + StringRef Indent = "\t"; + HANDLE_TRIVIAL_FIELD( + ".amdhsa_system_sgpr_private_segment_wavefront_offset", + COMPUTE_PGM_RSRC2_ENABLE_SGPR_PRIVATE_SEGMENT_WAVEFRONT_OFFSET); + HANDLE_TRIVIAL_FIELD(".amdhsa_system_sgpr_workgroup_id_x", + COMPUTE_PGM_RSRC2_ENABLE_SGPR_WORKGROUP_ID_X); + HANDLE_TRIVIAL_FIELD(".amdhsa_system_sgpr_workgroup_id_y", + COMPUTE_PGM_RSRC2_ENABLE_SGPR_WORKGROUP_ID_Y); + HANDLE_TRIVIAL_FIELD(".amdhsa_system_sgpr_workgroup_id_z", + COMPUTE_PGM_RSRC2_ENABLE_SGPR_WORKGROUP_ID_Z); + HANDLE_TRIVIAL_FIELD(".amdhsa_system_sgpr_workgroup_info", + COMPUTE_PGM_RSRC2_ENABLE_SGPR_WORKGROUP_INFO); + HANDLE_TRIVIAL_FIELD(".amdhsa_system_vgpr_workitem_id", + COMPUTE_PGM_RSRC2_ENABLE_VGPR_WORKITEM_ID); + + if (FourByteBuffer & COMPUTE_PGM_RSRC2_ENABLE_EXCEPTION_ADDRESS_WATCH) + return MCDisassembler::Fail; + + if (FourByteBuffer & COMPUTE_PGM_RSRC2_ENABLE_EXCEPTION_MEMORY) + return MCDisassembler::Fail; + + if (FourByteBuffer & COMPUTE_PGM_RSRC2_GRANULATED_LDS_SIZE) + return MCDisassembler::Fail; + + HANDLE_TRIVIAL_FIELD( + ".amdhsa_exception_fp_ieee_invalid_op", + COMPUTE_PGM_RSRC2_ENABLE_EXCEPTION_IEEE_754_FP_INVALID_OPERATION); + HANDLE_TRIVIAL_FIELD(".amdhsa_exception_fp_denorm_src", + COMPUTE_PGM_RSRC2_ENABLE_EXCEPTION_FP_DENORMAL_SOURCE); + HANDLE_TRIVIAL_FIELD( + ".amdhsa_exception_fp_ieee_div_zero", + COMPUTE_PGM_RSRC2_ENABLE_EXCEPTION_IEEE_754_FP_DIVISION_BY_ZERO); + HANDLE_TRIVIAL_FIELD(".amdhsa_exception_fp_ieee_overflow", + COMPUTE_PGM_RSRC2_ENABLE_EXCEPTION_IEEE_754_FP_OVERFLOW); + HANDLE_TRIVIAL_FIELD( + ".amdhsa_exception_fp_ieee_underflow", + COMPUTE_PGM_RSRC2_ENABLE_EXCEPTION_IEEE_754_FP_UNDERFLOW); + HANDLE_TRIVIAL_FIELD(".amdhsa_exception_fp_ieee_inexact", + COMPUTE_PGM_RSRC2_ENABLE_EXCEPTION_IEEE_754_FP_INEXACT); + HANDLE_TRIVIAL_FIELD(".amdhsa_exception_int_div_zero", + COMPUTE_PGM_RSRC2_ENABLE_EXCEPTION_INT_DIVIDE_BY_ZERO); + + if (FourByteBuffer & COMPUTE_PGM_RSRC2_RESERVED0) + return MCDisassembler::Fail; + 
+ return MCDisassembler::Success; +} // decodeCOMPUTE_PGM_RSRC2() + +#undef HANDLE_TRIVIAL_FIELD + +MCDisassembler::DecodeStatus +AMDGPUDisassembler::decodeKernelDescriptorDirective( + DataExtractor::Cursor &Cursor, ArrayRef Bytes, uint64_t &Size, + raw_string_ostream &KdStream) const { +#define HANDLE_TRIVIAL_FIELD(DIRECTIVE, MASK) \ + do { \ + KdStream << Indent << DIRECTIVE " " \ + << ((TwoByteBuffer & MASK) >> (MASK##_SHIFT)) << '\n'; \ + } while (0) + + uint16_t TwoByteBuffer = 0; + uint32_t FourByteBuffer = 0; + uint64_t EightByteBuffer = 0; + + StringRef ReservedBytes; + StringRef Indent = "\t"; + + DataExtractor DE(Bytes, /*IsLittleEndian=*/true, /*AddressSize=*/8); + // When we fail, we set: + // Size = current cursor position (starting point of the chunk of bytes) + // + length of the chunk. + // The failed region is from 0 to this new value of Size. We do this because + // most directives in the kernel descriptor affect a single or very few bits. + switch (Cursor.tell()) { + case 0: + FourByteBuffer = DE.getU32(Cursor); + KdStream << Indent << ".amdhsa_group_segment_fixed_size " << FourByteBuffer + << '\n'; + return MCDisassembler::Success; + + case 4: // 0 + 4 + FourByteBuffer = DE.getU32(Cursor); + KdStream << Indent << ".amdhsa_private_segment_fixed_size " + << FourByteBuffer << '\n'; + return MCDisassembler::Success; + + case 8: // 4 + 4 + // 8 reserved bytes, must be 0. + EightByteBuffer = DE.getU64(Cursor); + if (EightByteBuffer) { + Size = 8 + 8; + return MCDisassembler::Fail; + } + return MCDisassembler::Success; + + case 16: // 8 + 8 + // KERNEL_CODE_ENTRY_BYTE_OFFSET + // So far no directive controls this for Code Object V3, so simply skip for + // disassembly. + DE.getU64(Cursor); + return MCDisassembler::Success; + + case 24: // 16 + 8 + // 20 reserved bytes, must be 0. 
+ ReservedBytes = DE.getBytes(Cursor, 20); + for (int I = 0; I < 20; ++I) { + if (ReservedBytes[I] != 0) { + Size = 24 + 20; + return MCDisassembler::Fail; + } + } + return MCDisassembler::Success; + + case 44: // 24 + 20 + // COMPUTE_PGM_RSRC3 + // - Only set for GFX10, GFX6-9 have this to be 0. + // - Currently no directives directly control this. + FourByteBuffer = DE.getU32(Cursor); + if (!isGFX10() && FourByteBuffer) { + Size = 44 + 4; + return MCDisassembler::Fail; + } + return MCDisassembler::Success; + + case 48: // 44 + 4 + // COMPUTE_PGM_RSRC1 + FourByteBuffer = DE.getU32(Cursor); + if (decodeCOMPUTE_PGM_RSRC1(FourByteBuffer, KdStream) == + MCDisassembler::Fail) { + Size = 48 + 4; + return MCDisassembler::Fail; + } + return MCDisassembler::Success; + + case 52: // 48 + 4 + // COMPUTE_PGM_RSRC2 + FourByteBuffer = DE.getU32(Cursor); + if (decodeCOMPUTE_PGM_RSRC2(FourByteBuffer, KdStream) == + MCDisassembler::Fail) { + Size = 52 + 4; + return MCDisassembler::Fail; + } + return MCDisassembler::Success; + + case 56: // 52 + 4 + using namespace amdhsa; + TwoByteBuffer = DE.getU16(Cursor); + + HANDLE_TRIVIAL_FIELD( + ".amdhsa_user_sgpr_private_segment_buffer", + KERNEL_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_BUFFER); + HANDLE_TRIVIAL_FIELD(".amdhsa_user_sgpr_dispatch_ptr", + KERNEL_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_PTR); + HANDLE_TRIVIAL_FIELD(".amdhsa_user_sgpr_queue_ptr", + KERNEL_CODE_PROPERTY_ENABLE_SGPR_QUEUE_PTR); + HANDLE_TRIVIAL_FIELD(".amdhsa_user_sgpr_kernarg_segment_ptr", + KERNEL_CODE_PROPERTY_ENABLE_SGPR_KERNARG_SEGMENT_PTR); + HANDLE_TRIVIAL_FIELD(".amdhsa_user_sgpr_dispatch_id", + KERNEL_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_ID); + HANDLE_TRIVIAL_FIELD(".amdhsa_user_sgpr_flat_scratch_init", + KERNEL_CODE_PROPERTY_ENABLE_SGPR_FLAT_SCRATCH_INIT); + HANDLE_TRIVIAL_FIELD(".amdhsa_user_sgpr_private_segment_size", + KERNEL_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_SIZE); + + // Next 3 bits are reserved, must be 0. 
+ if ((TwoByteBuffer & KERNEL_CODE_PROPERTY_RESERVED0) >> + KERNEL_CODE_PROPERTY_RESERVED0_SHIFT) { + Size = 56 + 2; + return MCDisassembler::Fail; + } + + HANDLE_TRIVIAL_FIELD(".amdhsa_wavefront_size32", + KERNEL_CODE_PROPERTY_ENABLE_WAVEFRONT_SIZE32); + + // Rest of the bits are reserved and must be 0. + if (TwoByteBuffer & KERNEL_CODE_PROPERTY_RESERVED1) { + Size = 56 + 2; + return MCDisassembler::Fail; + } + return MCDisassembler::Success; + + case 58: // 56 + 2 + // 6 bytes from here are reserved, must be 0. + ReservedBytes = DE.getBytes(Cursor, 6); + for (int I = 0; I < 6; ++I) { + if (ReservedBytes[I] != 0) { + Size = 58 + 6; + return MCDisassembler::Fail; + } + } + // 58 + 6 = 64. End of kernel descriptor. + return MCDisassembler::Success; + } +#undef HANDLE_TRIVIAL_FIELD +} // decodeKernelDescriptorDirective() + +MCDisassembler::DecodeStatus AMDGPUDisassembler::decodeKernelDescriptor( + StringRef KdName, ArrayRef Bytes, uint64_t &Size, + uint64_t KdAddress) const { + // CP microcode requires the kernel descriptor to be 64 aligned. + if (Bytes.size() != 64 || KdAddress % 64 != 0) + return MCDisassembler::Fail; + + std::string Kd; + raw_string_ostream KdStream(Kd); + KdStream << ".amdhsa_kernel " << KdName.drop_back(3).str() << '\n'; + + DataExtractor::Cursor C(0); + while (C && C.tell() < Bytes.size()) { + MCDisassembler::DecodeStatus Status = + decodeKernelDescriptorDirective(C, Bytes, Size, KdStream); + + if (Status == MCDisassembler::Fail) + return MCDisassembler::Fail; + } + KdStream << ".end_amdhsa_kernel\n"; + outs() << KdStream.str(); + return MCDisassembler::Success; +} + +Optional +AMDGPUDisassembler::onSymbolStart(SymbolInfoTy &Symbol, uint64_t &Size, + ArrayRef Bytes, uint64_t Address, + raw_ostream &CStream) const { + // Right now only kernel descriptor needs to be handled. + // We ignore all other symbols for target specific handling. + // TODO: + // Fix the spurious symbol issue for AMDGPU kernels. Exists for both Code + // Object V2 and V3. 
+ + // amd_kernel_code_t for Code Object V2. + // Right now this condition will always evaluate to false due to above + // mentioned issue. + if (Symbol.Type == ELF::STT_AMDGPU_HSA_KERNEL) { + Size = 256; + return MCDisassembler::SoftFail; + } + + // Code Object V3 kernel descriptors. + StringRef Name = Symbol.Name; + if (Symbol.Type == ELF::STT_OBJECT && Name.endswith(StringRef(".kd"))) { + if (decodeKernelDescriptor(Name, Bytes, Size, Address) == + MCDisassembler::Success) { + Size = Bytes.size(); + return MCDisassembler::Success; + } + return MCDisassembler::Fail; + } + return None; +} + //===----------------------------------------------------------------------===// // AMDGPUSymbolizer //===----------------------------------------------------------------------===// diff --git a/llvm/test/tools/llvm-objdump/ELF/AMDGPU/code-object-v3.ll b/llvm/test/tools/llvm-objdump/ELF/AMDGPU/code-object-v3.ll new file mode 100644 --- /dev/null +++ b/llvm/test/tools/llvm-objdump/ELF/AMDGPU/code-object-v3.ll @@ -0,0 +1,50 @@ +; This file is used to test disassembly for the entire AMDGPU ELF object (V3 only). +; Right now we only check for kernel descriptors. AMDGPU note records will follow next. +; Eventually we will check disassembly for everything that is in an AMDGPU ELF object. 
+ +; RUN: llc < %s -mtriple=amdgcn-amd-amdhsa -mcpu=gfx908 -mattr=+code-object-v3 -filetype=obj -amdgpu-early-inline-all=true -amdgpu-function-calls=false -o %t.o +; RUN: llvm-objdump --triple=amdgcn-amd-amdhsa --mcpu=gfx908 -D %t.o | FileCheck --check-prefix=CHECK %s + +; CHECK: .amdhsa_kernel my_kernel +; CHECK-NEXT: .amdhsa_group_segment_fixed_size 0 +; CHECK-NEXT: .amdhsa_private_segment_fixed_size 0 +; CHECK-NEXT: .amdhsa_next_free_vgpr .amdgcn.next_free_vgpr +; CHECK-NEXT: .amdhsa_reserve_vcc 0 +; CHECK-NEXT: .amdhsa_reserve_flat_scratch 0 +; CHECK-NEXT: .amdhsa_reserve_xnack_mask 0 +; CHECK-NEXT: .amdhsa_next_free_sgpr .amdgcn.next_free_sgpr +; CHECK-NEXT: .amdhsa_float_round_mode_32 0 +; CHECK-NEXT: .amdhsa_float_round_mode_16_64 0 +; CHECK-NEXT: .amdhsa_float_denorm_mode_32 3 +; CHECK-NEXT: .amdhsa_float_denorm_mode_16_64 3 +; CHECK-NEXT: .amdhsa_dx10_clamp 1 +; CHECK-NEXT: .amdhsa_ieee_mode 1 +; CHECK-NEXT: .amdhsa_fp16_overflow 0 +; CHECK-NEXT: .amdhsa_workgroup_processor_mode 0 +; CHECK-NEXT: .amdhsa_memory_ordered 0 +; CHECK-NEXT: .amdhsa_forward_progress 0 +; CHECK-NEXT: .amdhsa_system_sgpr_private_segment_wavefront_offset 0 +; CHECK-NEXT: .amdhsa_system_sgpr_workgroup_id_x 1 +; CHECK-NEXT: .amdhsa_system_sgpr_workgroup_id_y 0 +; CHECK-NEXT: .amdhsa_system_sgpr_workgroup_id_z 0 +; CHECK-NEXT: .amdhsa_system_sgpr_workgroup_info 0 +; CHECK-NEXT: .amdhsa_system_vgpr_workitem_id 0 +; CHECK-NEXT: .amdhsa_exception_fp_ieee_invalid_op 0 +; CHECK-NEXT: .amdhsa_exception_fp_denorm_src 0 +; CHECK-NEXT: .amdhsa_exception_fp_ieee_div_zero 0 +; CHECK-NEXT: .amdhsa_exception_fp_ieee_overflow 0 +; CHECK-NEXT: .amdhsa_exception_fp_ieee_underflow 0 +; CHECK-NEXT: .amdhsa_exception_fp_ieee_inexact 0 +; CHECK-NEXT: .amdhsa_exception_int_div_zero 0 +; CHECK-NEXT: .amdhsa_user_sgpr_private_segment_buffer 1 +; CHECK-NEXT: .amdhsa_user_sgpr_dispatch_ptr 0 +; CHECK-NEXT: .amdhsa_user_sgpr_queue_ptr 0 +; CHECK-NEXT: .amdhsa_user_sgpr_kernarg_segment_ptr 0 +; CHECK-NEXT: 
.amdhsa_user_sgpr_dispatch_id 0 +; CHECK-NEXT: .amdhsa_user_sgpr_flat_scratch_init 0 +; CHECK-NEXT: .amdhsa_user_sgpr_private_segment_size 0 +; CHECK-NEXT: .amdhsa_wavefront_size32 0 +; CHECK-NEXT: .end_amdhsa_kernel +define amdgpu_kernel void @my_kernel() { + ret void +} diff --git a/llvm/tools/llvm-objdump/llvm-objdump.cpp b/llvm/tools/llvm-objdump/llvm-objdump.cpp --- a/llvm/tools/llvm-objdump/llvm-objdump.cpp +++ b/llvm/tools/llvm-objdump/llvm-objdump.cpp @@ -1396,23 +1396,6 @@ outs() << SectionName << ":\n"; } - if (Obj->isELF() && Obj->getArch() == Triple::amdgcn) { - if (Symbols[SI].Type == ELF::STT_AMDGPU_HSA_KERNEL) { - // skip amd_kernel_code_t at the begining of kernel symbol (256 bytes) - Start += 256; - } - if (SI == SE - 1 || - Symbols[SI + 1].Type == ELF::STT_AMDGPU_HSA_KERNEL) { - // cut trailing zeroes at the end of kernel - // cut up to 256 bytes - const uint64_t EndAlign = 256; - const auto Limit = End - (std::min)(EndAlign, End - Start); - while (End > Limit && - *reinterpret_cast(&Bytes[End - 4]) == 0) - End -= 4; - } - } - outs() << '\n'; if (!NoLeadingAddr) outs() << format(Is64Bits ? "%016" PRIx64 " " : "%08" PRIx64 " ",