diff --git a/llvm/include/llvm/Support/AMDHSAKernelDescriptor.h b/llvm/include/llvm/Support/AMDHSAKernelDescriptor.h --- a/llvm/include/llvm/Support/AMDHSAKernelDescriptor.h +++ b/llvm/include/llvm/Support/AMDHSAKernelDescriptor.h @@ -9,6 +9,13 @@ /// \file /// AMDHSA kernel descriptor definitions. For more information, visit /// https://llvm.org/docs/AMDGPUUsage.html#kernel-descriptor +/// +/// \warning +/// Any changes to this file should also be audited for corresponding changes +/// needed in both the assembler and disassembler, namely: +/// * AMDGPUAsmPrinter.{cpp,h} +/// * AMDGPUTargetStreamer.{cpp,h} +/// * AMDGPUDisassembler.{cpp,h} // //===----------------------------------------------------------------------===// diff --git a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp --- a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp +++ b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp @@ -5212,7 +5212,7 @@ if (IVersion.Major == 10) { // SharedVGPRCount < 16 checked by PARSE_ENTRY_BITS - if (SharedVGPRCount && EnableWavefrontSize32) { + if (SharedVGPRCount && EnableWavefrontSize32 && *EnableWavefrontSize32) { return TokError("shared_vgpr_count directive not valid on " "wavefront size 32"); } diff --git a/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h b/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h --- a/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h +++ b/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h @@ -24,6 +24,7 @@ namespace llvm { +class MCAsmInfo; class MCInst; class MCOperand; class MCSubtargetInfo; @@ -91,10 +92,12 @@ private: std::unique_ptr const MCII; const MCRegisterInfo &MRI; + const MCAsmInfo &MAI; const unsigned TargetMaxInstBytes; mutable ArrayRef Bytes; mutable uint32_t Literal; mutable bool HasLiteral; + mutable Optional EnableWavefrontSize32; public: AMDGPUDisassembler(const MCSubtargetInfo &STI, MCContext &Ctx, @@ -156,6 +159,13 @@ 
DecodeStatus decodeCOMPUTE_PGM_RSRC2(uint32_t FourByteBuffer, raw_string_ostream &KdStream) const; + /// Decode as directives that handle COMPUTE_PGM_RSRC3. + /// \param FourByteBuffer - Bytes holding contents of COMPUTE_PGM_RSRC3. + /// \param KdStream - Stream to write the disassembled directives to. + // NOLINTNEXTLINE(readability-identifier-naming) + DecodeStatus decodeCOMPUTE_PGM_RSRC3(uint32_t FourByteBuffer, + raw_string_ostream &KdStream) const; + DecodeStatus convertEXPInst(MCInst &MI) const; DecodeStatus convertVINTERPInst(MCInst &MI) const; DecodeStatus convertFMAanyK(MCInst &MI, int ImmLitIdx) const; diff --git a/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp b/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp --- a/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp +++ b/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp @@ -45,11 +45,9 @@ using DecodeStatus = llvm::MCDisassembler::DecodeStatus; AMDGPUDisassembler::AMDGPUDisassembler(const MCSubtargetInfo &STI, - MCContext &Ctx, - MCInstrInfo const *MCII) : - MCDisassembler(STI, Ctx), MCII(MCII), MRI(*Ctx.getRegisterInfo()), - TargetMaxInstBytes(Ctx.getAsmInfo()->getMaxInstLength(&STI)) { - + MCContext &Ctx, MCInstrInfo const *MCII) + : MCDisassembler(STI, Ctx), MCII(MCII), MRI(*Ctx.getRegisterInfo()), + MAI(*Ctx.getAsmInfo()), TargetMaxInstBytes(MAI.getMaxInstLength(&STI)) { // ToDo: AMDGPUDisassembler supports only VI ISA. 
if (!STI.getFeatureBits()[AMDGPU::FeatureGCN3Encoding] && !isGFX10Plus()) report_fatal_error("Disassembly not yet supported for subtarget"); @@ -1684,10 +1682,15 @@ //===----------------------------------------------------------------------===// // AMDGPU specific symbol handling //===----------------------------------------------------------------------===// +#define GET_FIELD(MASK) (AMDHSA_BITS_GET(FourByteBuffer, MASK)) #define PRINT_DIRECTIVE(DIRECTIVE, MASK) \ do { \ - KdStream << Indent << DIRECTIVE " " \ - << ((FourByteBuffer & MASK) >> (MASK##_SHIFT)) << '\n'; \ + KdStream << Indent << DIRECTIVE " " << GET_FIELD(MASK) << '\n'; \ + } while (0) +#define PRINT_PSEUDO_DIRECTIVE_COMMENT(DIRECTIVE, MASK) \ + do { \ + KdStream << Indent << MAI.getCommentString() << ' ' << DIRECTIVE " " \ + << GET_FIELD(MASK) << '\n'; \ } while (0) // NOLINTNEXTLINE(readability-identifier-naming) @@ -1702,11 +1705,11 @@ // simply calculate the inverse of what the assembler does. uint32_t GranulatedWorkitemVGPRCount = - (FourByteBuffer & COMPUTE_PGM_RSRC1_GRANULATED_WORKITEM_VGPR_COUNT) >> - COMPUTE_PGM_RSRC1_GRANULATED_WORKITEM_VGPR_COUNT_SHIFT; + GET_FIELD(COMPUTE_PGM_RSRC1_GRANULATED_WORKITEM_VGPR_COUNT); - uint32_t NextFreeVGPR = (GranulatedWorkitemVGPRCount + 1) * - AMDGPU::IsaInfo::getVGPREncodingGranule(&STI); + uint32_t NextFreeVGPR = + (GranulatedWorkitemVGPRCount + 1) * + AMDGPU::IsaInfo::getVGPREncodingGranule(&STI, EnableWavefrontSize32); KdStream << Indent << ".amdhsa_next_free_vgpr " << NextFreeVGPR << '\n'; @@ -1730,8 +1733,7 @@ // The disassembler cannot recover the original values of those 3 directives. 
uint32_t GranulatedWavefrontSGPRCount = - (FourByteBuffer & COMPUTE_PGM_RSRC1_GRANULATED_WAVEFRONT_SGPR_COUNT) >> - COMPUTE_PGM_RSRC1_GRANULATED_WAVEFRONT_SGPR_COUNT_SHIFT; + GET_FIELD(COMPUTE_PGM_RSRC1_GRANULATED_WAVEFRONT_SGPR_COUNT); if (isGFX10Plus() && GranulatedWavefrontSGPRCount) return MCDisassembler::Fail; @@ -1841,7 +1843,46 @@ return MCDisassembler::Success; } +// NOLINTNEXTLINE(readability-identifier-naming) +MCDisassembler::DecodeStatus AMDGPUDisassembler::decodeCOMPUTE_PGM_RSRC3( + uint32_t FourByteBuffer, raw_string_ostream &KdStream) const { + using namespace amdhsa; + StringRef Indent = "\t"; + if (isGFX90A()) { + KdStream << Indent << ".amdhsa_accum_offset " + << (GET_FIELD(COMPUTE_PGM_RSRC3_GFX90A_ACCUM_OFFSET) + 1) * 4 + << '\n'; + if (FourByteBuffer & COMPUTE_PGM_RSRC3_GFX90A_RESERVED0) + return MCDisassembler::Fail; + PRINT_DIRECTIVE(".amdhsa_tg_split", COMPUTE_PGM_RSRC3_GFX90A_TG_SPLIT); + if (FourByteBuffer & COMPUTE_PGM_RSRC3_GFX90A_RESERVED1) + return MCDisassembler::Fail; + } else if (isGFX10Plus()) { + if (!EnableWavefrontSize32 || !*EnableWavefrontSize32) { + PRINT_DIRECTIVE(".amdhsa_shared_vgpr_count", + COMPUTE_PGM_RSRC3_GFX10_PLUS_SHARED_VGPR_COUNT); + } else { + PRINT_PSEUDO_DIRECTIVE_COMMENT( + "SHARED_VGPR_COUNT", COMPUTE_PGM_RSRC3_GFX10_PLUS_SHARED_VGPR_COUNT); + } + PRINT_PSEUDO_DIRECTIVE_COMMENT("INST_PREF_SIZE", + COMPUTE_PGM_RSRC3_GFX10_PLUS_INST_PREF_SIZE); + PRINT_PSEUDO_DIRECTIVE_COMMENT("TRAP_ON_START", + COMPUTE_PGM_RSRC3_GFX10_PLUS_TRAP_ON_START); + PRINT_PSEUDO_DIRECTIVE_COMMENT("TRAP_ON_END", + COMPUTE_PGM_RSRC3_GFX10_PLUS_TRAP_ON_END); + if (FourByteBuffer & COMPUTE_PGM_RSRC3_GFX10_PLUS_RESERVED0) + return MCDisassembler::Fail; + PRINT_PSEUDO_DIRECTIVE_COMMENT("IMAGE_OP", + COMPUTE_PGM_RSRC3_GFX10_PLUS_IMAGE_OP); + } else if (FourByteBuffer) { + return MCDisassembler::Fail; + } + return MCDisassembler::Success; +} +#undef PRINT_PSEUDO_DIRECTIVE_COMMENT #undef PRINT_DIRECTIVE +#undef GET_FIELD
MCDisassembler::DecodeStatus AMDGPUDisassembler::decodeKernelDescriptorDirective( @@ -1909,30 +1950,16 @@ return MCDisassembler::Success; case amdhsa::COMPUTE_PGM_RSRC3_OFFSET: - // COMPUTE_PGM_RSRC3 - // - Only set for GFX10, GFX6-9 have this to be 0. - // - Currently no directives directly control this. FourByteBuffer = DE.getU32(Cursor); - if (!isGFX10Plus() && FourByteBuffer) { - return MCDisassembler::Fail; - } - return MCDisassembler::Success; + return decodeCOMPUTE_PGM_RSRC3(FourByteBuffer, KdStream); case amdhsa::COMPUTE_PGM_RSRC1_OFFSET: FourByteBuffer = DE.getU32(Cursor); - if (decodeCOMPUTE_PGM_RSRC1(FourByteBuffer, KdStream) == - MCDisassembler::Fail) { - return MCDisassembler::Fail; - } - return MCDisassembler::Success; + return decodeCOMPUTE_PGM_RSRC1(FourByteBuffer, KdStream); case amdhsa::COMPUTE_PGM_RSRC2_OFFSET: FourByteBuffer = DE.getU32(Cursor); - if (decodeCOMPUTE_PGM_RSRC2(FourByteBuffer, KdStream) == - MCDisassembler::Fail) { - return MCDisassembler::Fail; - } - return MCDisassembler::Success; + return decodeCOMPUTE_PGM_RSRC2(FourByteBuffer, KdStream); case amdhsa::KERNEL_CODE_PROPERTIES_OFFSET: using namespace amdhsa; @@ -1994,6 +2021,20 @@ if (Bytes.size() != 64 || KdAddress % 64 != 0) return MCDisassembler::Fail; + // FIXME: We can't actually decode "in order" as is done below, as e.g. GFX10 + // requires us to know the setting of .amdhsa_wavefront_size32 in order to + // accurately produce .amdhsa_next_free_vgpr, and they appear in the wrong + // order. Work around this by first looking up .amdhsa_wavefront_size32 here + // when required.
+ if (isGFX10Plus()) { + uint16_t KernelCodeProperties = support::endian::read16( + (const void *)&Bytes[amdhsa::KERNEL_CODE_PROPERTIES_OFFSET], + support::endianness::little); + EnableWavefrontSize32 = + AMDHSA_BITS_GET(KernelCodeProperties, + amdhsa::KERNEL_CODE_PROPERTY_ENABLE_WAVEFRONT_SIZE32); + } + std::string Kd; raw_string_ostream KdStream(Kd); KdStream << ".amdhsa_kernel " << KdName << '\n'; diff --git a/llvm/test/tools/llvm-objdump/ELF/AMDGPU/kd-gfx10.s b/llvm/test/tools/llvm-objdump/ELF/AMDGPU/kd-gfx10.s new file mode 100644 --- /dev/null +++ b/llvm/test/tools/llvm-objdump/ELF/AMDGPU/kd-gfx10.s @@ -0,0 +1,240 @@ +;; Test disassembly for gfx10 kernel descriptor. + +; RUN: split-file %s %t.dir + +; RUN: llvm-mc %t.dir/1.s --triple=amdgcn-amd-amdhsa -mcpu=gfx1010 -mattr=-xnack,+wavefrontsize32,-wavefrontsize64 -filetype=obj -o %t1 +; RUN: llvm-objdump --disassemble-symbols=my_kernel_1.kd %t1 | tail -n +7 \ +; RUN: | llvm-mc --triple=amdgcn-amd-amdhsa -mcpu=gfx1010 -mattr=-xnack,+wavefrontsize32,-wavefrontsize64 -filetype=obj -o %t1-re-assemble +; RUN: diff %t1 %t1-re-assemble +; RUN: llvm-objdump --disassemble-symbols=my_kernel_1.kd %t1 | tail -n +7 | FileCheck --check-prefixes=KERNEL1 %s + +; RUN: llvm-mc %t.dir/2.s --triple=amdgcn-amd-amdhsa -mcpu=gfx1010 -mattr=-xnack,+wavefrontsize64,-wavefrontsize32 -filetype=obj -o %t2 +; RUN: llvm-objdump --disassemble-symbols=my_kernel_2.kd %t2 | tail -n +7 \ +; RUN: | llvm-mc --triple=amdgcn-amd-amdhsa -mcpu=gfx1010 -mattr=-xnack,+wavefrontsize64,-wavefrontsize32 -filetype=obj -o %t2-re-assemble +; RUN: diff %t2 %t2-re-assemble +; RUN: llvm-objdump --disassemble-symbols=my_kernel_2.kd %t2 | tail -n +7 | FileCheck --check-prefixes=KERNEL2 %s + +; RUN: llvm-mc %t.dir/3.s --triple=amdgcn-amd-amdhsa -mcpu=gfx1010 -mattr=-xnack,+wavefrontsize64,-wavefrontsize32 -filetype=obj -o %t3 +; RUN: llvm-objdump --disassemble-symbols=my_kernel_3.kd %t3 | tail -n +7 \ +; RUN: | llvm-mc --triple=amdgcn-amd-amdhsa -mcpu=gfx1010 
-mattr=-xnack,+wavefrontsize64,-wavefrontsize32 -filetype=obj -o %t3-re-assemble +; RUN: diff %t3 %t3-re-assemble +; RUN: llvm-objdump --disassemble-symbols=my_kernel_3.kd %t3 | tail -n +7 | FileCheck --check-prefixes=KERNEL3 %s + +; RUN: llvm-mc %t.dir/4.s --triple=amdgcn-amd-amdhsa -mcpu=gfx1010 -mattr=-xnack,+wavefrontsize64,-wavefrontsize32 -filetype=obj -o %t4 +; RUN: llvm-objdump --disassemble-symbols=my_kernel_4.kd %t4 | tail -n +7 \ +; RUN: | llvm-mc --triple=amdgcn-amd-amdhsa -mcpu=gfx1010 -mattr=-xnack,+wavefrontsize64,-wavefrontsize32 -filetype=obj -o %t4-re-assemble +; RUN: diff %t4 %t4-re-assemble +; RUN: llvm-objdump --disassemble-symbols=my_kernel_4.kd %t4 | tail -n +7 | FileCheck --check-prefixes=KERNEL4 %s + +;--- 1.s +; KERNEL1-LABEL: .amdhsa_kernel my_kernel_1 +; KERNEL1-NEXT: .amdhsa_group_segment_fixed_size 0 +; KERNEL1-NEXT: .amdhsa_private_segment_fixed_size 0 +; KERNEL1-NEXT: .amdhsa_kernarg_size 0 +; KERNEL1-NEXT: ; SHARED_VGPR_COUNT 0 +; KERNEL1-NEXT: ; INST_PREF_SIZE 0 +; KERNEL1-NEXT: ; TRAP_ON_START 0 +; KERNEL1-NEXT: ; TRAP_ON_END 0 +; KERNEL1-NEXT: ; IMAGE_OP 0 +; KERNEL1-NEXT: .amdhsa_next_free_vgpr 32 +; KERNEL1-NEXT: .amdhsa_reserve_vcc 0 +; KERNEL1-NEXT: .amdhsa_reserve_flat_scratch 0 +; KERNEL1-NEXT: .amdhsa_reserve_xnack_mask 0 +; KERNEL1-NEXT: .amdhsa_next_free_sgpr 8 +; KERNEL1-NEXT: .amdhsa_float_round_mode_32 0 +; KERNEL1-NEXT: .amdhsa_float_round_mode_16_64 0 +; KERNEL1-NEXT: .amdhsa_float_denorm_mode_32 0 +; KERNEL1-NEXT: .amdhsa_float_denorm_mode_16_64 3 +; KERNEL1-NEXT: .amdhsa_dx10_clamp 1 +; KERNEL1-NEXT: .amdhsa_ieee_mode 1 +; KERNEL1-NEXT: .amdhsa_fp16_overflow 0 +; KERNEL1-NEXT: .amdhsa_workgroup_processor_mode 1 +; KERNEL1-NEXT: .amdhsa_memory_ordered 1 +; KERNEL1-NEXT: .amdhsa_forward_progress 0 +; KERNEL1-NEXT: .amdhsa_system_sgpr_private_segment_wavefront_offset 0 +; KERNEL1-NEXT: .amdhsa_system_sgpr_workgroup_id_x 1 +; KERNEL1-NEXT: .amdhsa_system_sgpr_workgroup_id_y 0 +; KERNEL1-NEXT: 
.amdhsa_system_sgpr_workgroup_id_z 0 +; KERNEL1-NEXT: .amdhsa_system_sgpr_workgroup_info 0 +; KERNEL1-NEXT: .amdhsa_system_vgpr_workitem_id 0 +; KERNEL1-NEXT: .amdhsa_exception_fp_ieee_invalid_op 0 +; KERNEL1-NEXT: .amdhsa_exception_fp_denorm_src 0 +; KERNEL1-NEXT: .amdhsa_exception_fp_ieee_div_zero 0 +; KERNEL1-NEXT: .amdhsa_exception_fp_ieee_overflow 0 +; KERNEL1-NEXT: .amdhsa_exception_fp_ieee_underflow 0 +; KERNEL1-NEXT: .amdhsa_exception_fp_ieee_inexact 0 +; KERNEL1-NEXT: .amdhsa_exception_int_div_zero 0 +; KERNEL1-NEXT: .amdhsa_user_sgpr_private_segment_buffer 0 +; KERNEL1-NEXT: .amdhsa_user_sgpr_dispatch_ptr 0 +; KERNEL1-NEXT: .amdhsa_user_sgpr_queue_ptr 0 +; KERNEL1-NEXT: .amdhsa_user_sgpr_kernarg_segment_ptr 0 +; KERNEL1-NEXT: .amdhsa_user_sgpr_dispatch_id 0 +; KERNEL1-NEXT: .amdhsa_user_sgpr_flat_scratch_init 0 +; KERNEL1-NEXT: .amdhsa_user_sgpr_private_segment_size 0 +; KERNEL1-NEXT: .amdhsa_wavefront_size32 1 +; KERNEL1-NEXT: .end_amdhsa_kernel +.amdhsa_kernel my_kernel_1 + .amdhsa_next_free_vgpr 32 + .amdhsa_next_free_sgpr 32 + .amdhsa_wavefront_size32 1 +.end_amdhsa_kernel + +;--- 2.s +; KERNEL2-LABEL: .amdhsa_kernel my_kernel_2 +; KERNEL2-NEXT: .amdhsa_group_segment_fixed_size 0 +; KERNEL2-NEXT: .amdhsa_private_segment_fixed_size 0 +; KERNEL2-NEXT: .amdhsa_kernarg_size 0 +; KERNEL2-NEXT: .amdhsa_shared_vgpr_count 0 +; KERNEL2-NEXT: ; INST_PREF_SIZE 0 +; KERNEL2-NEXT: ; TRAP_ON_START 0 +; KERNEL2-NEXT: ; TRAP_ON_END 0 +; KERNEL2-NEXT: ; IMAGE_OP 0 +; KERNEL2-NEXT: .amdhsa_next_free_vgpr 32 +; KERNEL2-NEXT: .amdhsa_reserve_vcc 0 +; KERNEL2-NEXT: .amdhsa_reserve_flat_scratch 0 +; KERNEL2-NEXT: .amdhsa_reserve_xnack_mask 0 +; KERNEL2-NEXT: .amdhsa_next_free_sgpr 8 +; KERNEL2-NEXT: .amdhsa_float_round_mode_32 0 +; KERNEL2-NEXT: .amdhsa_float_round_mode_16_64 0 +; KERNEL2-NEXT: .amdhsa_float_denorm_mode_32 0 +; KERNEL2-NEXT: .amdhsa_float_denorm_mode_16_64 3 +; KERNEL2-NEXT: .amdhsa_dx10_clamp 1 +; KERNEL2-NEXT: .amdhsa_ieee_mode 1 +; KERNEL2-NEXT: 
.amdhsa_fp16_overflow 0 +; KERNEL2-NEXT: .amdhsa_workgroup_processor_mode 1 +; KERNEL2-NEXT: .amdhsa_memory_ordered 1 +; KERNEL2-NEXT: .amdhsa_forward_progress 0 +; KERNEL2-NEXT: .amdhsa_system_sgpr_private_segment_wavefront_offset 0 +; KERNEL2-NEXT: .amdhsa_system_sgpr_workgroup_id_x 1 +; KERNEL2-NEXT: .amdhsa_system_sgpr_workgroup_id_y 0 +; KERNEL2-NEXT: .amdhsa_system_sgpr_workgroup_id_z 0 +; KERNEL2-NEXT: .amdhsa_system_sgpr_workgroup_info 0 +; KERNEL2-NEXT: .amdhsa_system_vgpr_workitem_id 0 +; KERNEL2-NEXT: .amdhsa_exception_fp_ieee_invalid_op 0 +; KERNEL2-NEXT: .amdhsa_exception_fp_denorm_src 0 +; KERNEL2-NEXT: .amdhsa_exception_fp_ieee_div_zero 0 +; KERNEL2-NEXT: .amdhsa_exception_fp_ieee_overflow 0 +; KERNEL2-NEXT: .amdhsa_exception_fp_ieee_underflow 0 +; KERNEL2-NEXT: .amdhsa_exception_fp_ieee_inexact 0 +; KERNEL2-NEXT: .amdhsa_exception_int_div_zero 0 +; KERNEL2-NEXT: .amdhsa_user_sgpr_private_segment_buffer 0 +; KERNEL2-NEXT: .amdhsa_user_sgpr_dispatch_ptr 0 +; KERNEL2-NEXT: .amdhsa_user_sgpr_queue_ptr 0 +; KERNEL2-NEXT: .amdhsa_user_sgpr_kernarg_segment_ptr 0 +; KERNEL2-NEXT: .amdhsa_user_sgpr_dispatch_id 0 +; KERNEL2-NEXT: .amdhsa_user_sgpr_flat_scratch_init 0 +; KERNEL2-NEXT: .amdhsa_user_sgpr_private_segment_size 0 +; KERNEL2-NEXT: .amdhsa_wavefront_size32 0 +; KERNEL2-NEXT: .end_amdhsa_kernel +.amdhsa_kernel my_kernel_2 + .amdhsa_next_free_vgpr 32 + .amdhsa_next_free_sgpr 32 + .amdhsa_shared_vgpr_count 0 +.end_amdhsa_kernel + +;--- 3.s +; KERNEL3-LABEL: .amdhsa_kernel my_kernel_3 +; KERNEL3-NEXT: .amdhsa_group_segment_fixed_size 0 +; KERNEL3-NEXT: .amdhsa_private_segment_fixed_size 0 +; KERNEL3-NEXT: .amdhsa_kernarg_size 0 +; KERNEL3-NEXT: .amdhsa_shared_vgpr_count 1 +; KERNEL3-NEXT: ; INST_PREF_SIZE 0 +; KERNEL3-NEXT: ; TRAP_ON_START 0 +; KERNEL3-NEXT: ; TRAP_ON_END 0 +; KERNEL3-NEXT: ; IMAGE_OP 0 +; KERNEL3-NEXT: .amdhsa_next_free_vgpr 32 +; KERNEL3-NEXT: .amdhsa_reserve_vcc 0 +; KERNEL3-NEXT: .amdhsa_reserve_flat_scratch 0 +; KERNEL3-NEXT: 
.amdhsa_reserve_xnack_mask 0 +; KERNEL3-NEXT: .amdhsa_next_free_sgpr 8 +; KERNEL3-NEXT: .amdhsa_float_round_mode_32 0 +; KERNEL3-NEXT: .amdhsa_float_round_mode_16_64 0 +; KERNEL3-NEXT: .amdhsa_float_denorm_mode_32 0 +; KERNEL3-NEXT: .amdhsa_float_denorm_mode_16_64 3 +; KERNEL3-NEXT: .amdhsa_dx10_clamp 1 +; KERNEL3-NEXT: .amdhsa_ieee_mode 1 +; KERNEL3-NEXT: .amdhsa_fp16_overflow 0 +; KERNEL3-NEXT: .amdhsa_workgroup_processor_mode 1 +; KERNEL3-NEXT: .amdhsa_memory_ordered 1 +; KERNEL3-NEXT: .amdhsa_forward_progress 0 +; KERNEL3-NEXT: .amdhsa_system_sgpr_private_segment_wavefront_offset 0 +; KERNEL3-NEXT: .amdhsa_system_sgpr_workgroup_id_x 1 +; KERNEL3-NEXT: .amdhsa_system_sgpr_workgroup_id_y 0 +; KERNEL3-NEXT: .amdhsa_system_sgpr_workgroup_id_z 0 +; KERNEL3-NEXT: .amdhsa_system_sgpr_workgroup_info 0 +; KERNEL3-NEXT: .amdhsa_system_vgpr_workitem_id 0 +; KERNEL3-NEXT: .amdhsa_exception_fp_ieee_invalid_op 0 +; KERNEL3-NEXT: .amdhsa_exception_fp_denorm_src 0 +; KERNEL3-NEXT: .amdhsa_exception_fp_ieee_div_zero 0 +; KERNEL3-NEXT: .amdhsa_exception_fp_ieee_overflow 0 +; KERNEL3-NEXT: .amdhsa_exception_fp_ieee_underflow 0 +; KERNEL3-NEXT: .amdhsa_exception_fp_ieee_inexact 0 +; KERNEL3-NEXT: .amdhsa_exception_int_div_zero 0 +; KERNEL3-NEXT: .amdhsa_user_sgpr_private_segment_buffer 0 +; KERNEL3-NEXT: .amdhsa_user_sgpr_dispatch_ptr 0 +; KERNEL3-NEXT: .amdhsa_user_sgpr_queue_ptr 0 +; KERNEL3-NEXT: .amdhsa_user_sgpr_kernarg_segment_ptr 0 +; KERNEL3-NEXT: .amdhsa_user_sgpr_dispatch_id 0 +; KERNEL3-NEXT: .amdhsa_user_sgpr_flat_scratch_init 0 +; KERNEL3-NEXT: .amdhsa_user_sgpr_private_segment_size 0 +; KERNEL3-NEXT: .amdhsa_wavefront_size32 0 +; KERNEL3-NEXT: .end_amdhsa_kernel +.amdhsa_kernel my_kernel_3 + .amdhsa_next_free_vgpr 32 + .amdhsa_next_free_sgpr 32 + .amdhsa_shared_vgpr_count 1 +.end_amdhsa_kernel + +;--- 4.s +; KERNEL4-LABEL: .amdhsa_kernel my_kernel_4 +; KERNEL4-NEXT: .amdhsa_group_segment_fixed_size 0 +; KERNEL4-NEXT: .amdhsa_private_segment_fixed_size 0 +; 
KERNEL4-NEXT: .amdhsa_kernarg_size 0 +; KERNEL4-NEXT: .amdhsa_shared_vgpr_count 1 +; KERNEL4-NEXT: ; INST_PREF_SIZE 0 +; KERNEL4-NEXT: ; TRAP_ON_START 0 +; KERNEL4-NEXT: ; TRAP_ON_END 0 +; KERNEL4-NEXT: ; IMAGE_OP 0 +; KERNEL4-NEXT: .amdhsa_next_free_vgpr 32 +; KERNEL4-NEXT: .amdhsa_reserve_vcc 0 +; KERNEL4-NEXT: .amdhsa_reserve_flat_scratch 0 +; KERNEL4-NEXT: .amdhsa_reserve_xnack_mask 0 +; KERNEL4-NEXT: .amdhsa_next_free_sgpr 8 +; KERNEL4-NEXT: .amdhsa_float_round_mode_32 0 +; KERNEL4-NEXT: .amdhsa_float_round_mode_16_64 0 +; KERNEL4-NEXT: .amdhsa_float_denorm_mode_32 0 +; KERNEL4-NEXT: .amdhsa_float_denorm_mode_16_64 3 +; KERNEL4-NEXT: .amdhsa_dx10_clamp 1 +; KERNEL4-NEXT: .amdhsa_ieee_mode 1 +; KERNEL4-NEXT: .amdhsa_fp16_overflow 0 +; KERNEL4-NEXT: .amdhsa_workgroup_processor_mode 1 +; KERNEL4-NEXT: .amdhsa_memory_ordered 1 +; KERNEL4-NEXT: .amdhsa_forward_progress 0 +; KERNEL4-NEXT: .amdhsa_system_sgpr_private_segment_wavefront_offset 0 +; KERNEL4-NEXT: .amdhsa_system_sgpr_workgroup_id_x 1 +; KERNEL4-NEXT: .amdhsa_system_sgpr_workgroup_id_y 0 +; KERNEL4-NEXT: .amdhsa_system_sgpr_workgroup_id_z 0 +; KERNEL4-NEXT: .amdhsa_system_sgpr_workgroup_info 0 +; KERNEL4-NEXT: .amdhsa_system_vgpr_workitem_id 0 +; KERNEL4-NEXT: .amdhsa_exception_fp_ieee_invalid_op 0 +; KERNEL4-NEXT: .amdhsa_exception_fp_denorm_src 0 +; KERNEL4-NEXT: .amdhsa_exception_fp_ieee_div_zero 0 +; KERNEL4-NEXT: .amdhsa_exception_fp_ieee_overflow 0 +; KERNEL4-NEXT: .amdhsa_exception_fp_ieee_underflow 0 +; KERNEL4-NEXT: .amdhsa_exception_fp_ieee_inexact 0 +; KERNEL4-NEXT: .amdhsa_exception_int_div_zero 0 +; KERNEL4-NEXT: .amdhsa_user_sgpr_private_segment_buffer 0 +; KERNEL4-NEXT: .amdhsa_user_sgpr_dispatch_ptr 0 +; KERNEL4-NEXT: .amdhsa_user_sgpr_queue_ptr 0 +; KERNEL4-NEXT: .amdhsa_user_sgpr_kernarg_segment_ptr 0 +; KERNEL4-NEXT: .amdhsa_user_sgpr_dispatch_id 0 +; KERNEL4-NEXT: .amdhsa_user_sgpr_flat_scratch_init 0 +; KERNEL4-NEXT: .amdhsa_user_sgpr_private_segment_size 0 +; KERNEL4-NEXT: 
.amdhsa_wavefront_size32 0 +; KERNEL4-NEXT: .end_amdhsa_kernel +.amdhsa_kernel my_kernel_4 + .amdhsa_next_free_vgpr 32 + .amdhsa_next_free_sgpr 32 + .amdhsa_shared_vgpr_count 1 + .amdhsa_wavefront_size32 0 +.end_amdhsa_kernel diff --git a/llvm/test/tools/llvm-objdump/ELF/AMDGPU/kd-gfx90a.s b/llvm/test/tools/llvm-objdump/ELF/AMDGPU/kd-gfx90a.s new file mode 100644 --- /dev/null +++ b/llvm/test/tools/llvm-objdump/ELF/AMDGPU/kd-gfx90a.s @@ -0,0 +1,49 @@ +;; Test disassembly for gfx90a kernel descriptor. + +; RUN: llvm-mc %s --triple=amdgcn-amd-amdhsa -mcpu=gfx90a -mattr=-xnack -filetype=obj -o - \ +; RUN: | llvm-objdump --disassemble-all - | FileCheck %s + +; RUN: split-file %s %t.dir + +; RUN: llvm-mc %t.dir/1.s --triple=amdgcn-amd-amdhsa -mcpu=gfx90a -mattr=-xnack -filetype=obj -o %t1 +; RUN: llvm-objdump --disassemble-symbols=my_kernel_1.kd %t1 | tail -n +7 \ +; RUN: | llvm-mc --triple=amdgcn-amd-amdhsa -mcpu=gfx90a -mattr=-xnack -filetype=obj -o %t1-re-assemble +; RUN: diff %t1 %t1-re-assemble + +; RUN: llvm-mc %t.dir/2.s --triple=amdgcn-amd-amdhsa -mcpu=gfx90a -mattr=-xnack -filetype=obj -o %t2 +; RUN: llvm-objdump --disassemble-symbols=my_kernel_2.kd %t2 | tail -n +7 \ +; RUN: | llvm-mc --triple=amdgcn-amd-amdhsa -mcpu=gfx90a -mattr=-xnack -filetype=obj -o %t2-re-assemble +; RUN: diff %t2 %t2-re-assemble + +; RUN: llvm-mc %t.dir/3.s --triple=amdgcn-amd-amdhsa -mcpu=gfx90a -mattr=-xnack -filetype=obj -o %t3 +; RUN: llvm-objdump --disassemble-symbols=my_kernel_3.kd %t3 | tail -n +7 \ +; RUN: | llvm-mc --triple=amdgcn-amd-amdhsa -mcpu=gfx90a -mattr=-xnack -filetype=obj -o %t3-re-assemble +; RUN: diff %t3 %t3-re-assemble + + +;--- 1.s +; CHECK-LABEL: .amdhsa_kernel my_kernel_1 +; CHECK: .amdhsa_accum_offset 4 +.amdhsa_kernel my_kernel_1 + .amdhsa_next_free_vgpr 0 + .amdhsa_next_free_sgpr 0 + .amdhsa_accum_offset 4 +.end_amdhsa_kernel + +;--- 2.s +; CHECK-LABEL: .amdhsa_kernel my_kernel_2 +; CHECK: .amdhsa_accum_offset 8 +.amdhsa_kernel my_kernel_2 + 
.amdhsa_next_free_vgpr 32 + .amdhsa_next_free_sgpr 0 + .amdhsa_accum_offset 8 +.end_amdhsa_kernel + +;--- 3.s +; CHECK-LABEL: .amdhsa_kernel my_kernel_3 +; CHECK: .amdhsa_accum_offset 12 +.amdhsa_kernel my_kernel_3 + .amdhsa_next_free_vgpr 32 + .amdhsa_next_free_sgpr 0 + .amdhsa_accum_offset 12 +.end_amdhsa_kernel diff --git a/llvm/test/tools/llvm-objdump/ELF/AMDGPU/kd-zeroed-gfx10.s b/llvm/test/tools/llvm-objdump/ELF/AMDGPU/kd-zeroed-gfx10.s --- a/llvm/test/tools/llvm-objdump/ELF/AMDGPU/kd-zeroed-gfx10.s +++ b/llvm/test/tools/llvm-objdump/ELF/AMDGPU/kd-zeroed-gfx10.s @@ -1,7 +1,7 @@ ;; Entirely zeroed kernel descriptor (for GFX10). ; RUN: llvm-mc %s --triple=amdgcn-amd-amdhsa -mcpu=gfx1010 -mattr=-xnack -filetype=obj -o %t -; RUN: llvm-objdump -s -j .text %t | FileCheck --check-prefix=OBJDUMP %s +; RUN: llvm-objdump -d -s -j .text %t | FileCheck --check-prefix=OBJDUMP %s ;; TODO: ;; This file and kd-zeroed-raw.s should produce the same output for the kernel @@ -11,10 +11,62 @@ ;; Check the raw bytes right now. 
-; OBJDUMP: 0000 00000000 00000000 00000000 00000000 +; OBJDUMP-LABEL: Contents of section .text: +; OBJDUMP-NEXT: 0000 00000000 00000000 00000000 00000000 ; OBJDUMP-NEXT: 0010 00000000 00000000 00000000 00000000 ; OBJDUMP-NEXT: 0020 00000000 00000000 00000000 00000000 ; OBJDUMP-NEXT: 0030 01000000 00000000 00000000 00000000 +; OBJDUMP-EMPTY: + +; OBJDUMP-LABEL: Disassembly of section .text: +; OBJDUMP-EMPTY: +; OBJDUMP-NEXT: 0000000000000000 : +; OBJDUMP-NEXT: .amdhsa_kernel my_kernel +; OBJDUMP-NEXT: .amdhsa_group_segment_fixed_size 0 +; OBJDUMP-NEXT: .amdhsa_private_segment_fixed_size 0 +; OBJDUMP-NEXT: .amdhsa_kernarg_size 0 +; OBJDUMP-NEXT: .amdhsa_shared_vgpr_count 0 +; OBJDUMP-NEXT: ; INST_PREF_SIZE 0 +; OBJDUMP-NEXT: ; TRAP_ON_START 0 +; OBJDUMP-NEXT: ; TRAP_ON_END 0 +; OBJDUMP-NEXT: ; IMAGE_OP 0 +; OBJDUMP-NEXT: .amdhsa_next_free_vgpr 8 +; OBJDUMP-NEXT: .amdhsa_reserve_vcc 0 +; OBJDUMP-NEXT: .amdhsa_reserve_flat_scratch 0 +; OBJDUMP-NEXT: .amdhsa_reserve_xnack_mask 0 +; OBJDUMP-NEXT: .amdhsa_next_free_sgpr 8 +; OBJDUMP-NEXT: .amdhsa_float_round_mode_32 0 +; OBJDUMP-NEXT: .amdhsa_float_round_mode_16_64 0 +; OBJDUMP-NEXT: .amdhsa_float_denorm_mode_32 0 +; OBJDUMP-NEXT: .amdhsa_float_denorm_mode_16_64 0 +; OBJDUMP-NEXT: .amdhsa_dx10_clamp 0 +; OBJDUMP-NEXT: .amdhsa_ieee_mode 0 +; OBJDUMP-NEXT: .amdhsa_fp16_overflow 0 +; OBJDUMP-NEXT: .amdhsa_workgroup_processor_mode 0 +; OBJDUMP-NEXT: .amdhsa_memory_ordered 0 +; OBJDUMP-NEXT: .amdhsa_forward_progress 0 +; OBJDUMP-NEXT: .amdhsa_system_sgpr_private_segment_wavefront_offset 0 +; OBJDUMP-NEXT: .amdhsa_system_sgpr_workgroup_id_x 0 +; OBJDUMP-NEXT: .amdhsa_system_sgpr_workgroup_id_y 0 +; OBJDUMP-NEXT: .amdhsa_system_sgpr_workgroup_id_z 0 +; OBJDUMP-NEXT: .amdhsa_system_sgpr_workgroup_info 0 +; OBJDUMP-NEXT: .amdhsa_system_vgpr_workitem_id 0 +; OBJDUMP-NEXT: .amdhsa_exception_fp_ieee_invalid_op 0 +; OBJDUMP-NEXT: .amdhsa_exception_fp_denorm_src 0 +; OBJDUMP-NEXT: .amdhsa_exception_fp_ieee_div_zero 0 +; OBJDUMP-NEXT: 
.amdhsa_exception_fp_ieee_overflow 0 +; OBJDUMP-NEXT: .amdhsa_exception_fp_ieee_underflow 0 +; OBJDUMP-NEXT: .amdhsa_exception_fp_ieee_inexact 0 +; OBJDUMP-NEXT: .amdhsa_exception_int_div_zero 0 +; OBJDUMP-NEXT: .amdhsa_user_sgpr_private_segment_buffer 0 +; OBJDUMP-NEXT: .amdhsa_user_sgpr_dispatch_ptr 0 +; OBJDUMP-NEXT: .amdhsa_user_sgpr_queue_ptr 0 +; OBJDUMP-NEXT: .amdhsa_user_sgpr_kernarg_segment_ptr 0 +; OBJDUMP-NEXT: .amdhsa_user_sgpr_dispatch_id 0 +; OBJDUMP-NEXT: .amdhsa_user_sgpr_flat_scratch_init 0 +; OBJDUMP-NEXT: .amdhsa_user_sgpr_private_segment_size 0 +; OBJDUMP-NEXT: .amdhsa_wavefront_size32 0 +; OBJDUMP-NEXT: .end_amdhsa_kernel .amdhsa_kernel my_kernel .amdhsa_group_segment_fixed_size 0 diff --git a/llvm/tools/llvm-objdump/llvm-objdump.cpp b/llvm/tools/llvm-objdump/llvm-objdump.cpp --- a/llvm/tools/llvm-objdump/llvm-objdump.cpp +++ b/llvm/tools/llvm-objdump/llvm-objdump.cpp @@ -1410,7 +1410,8 @@ // if (Status.hasValue()) { if (Status.getValue() == MCDisassembler::Fail) { - outs() << "// Error in decoding " << SymbolName + outs() << Ctx.getAsmInfo()->getCommentString() + << " Error in decoding " << SymbolName << " : Decoding failed region as bytes.\n"; for (uint64_t I = 0; I < Size; ++I) { outs() << "\t.byte\t " << format_hex(Bytes[I], 1, /*Upper=*/true)