diff --git a/llvm/include/llvm/Support/AMDHSAKernelDescriptor.h b/llvm/include/llvm/Support/AMDHSAKernelDescriptor.h --- a/llvm/include/llvm/Support/AMDHSAKernelDescriptor.h +++ b/llvm/include/llvm/Support/AMDHSAKernelDescriptor.h @@ -173,6 +173,15 @@ }; #undef KERNEL_CODE_PROPERTY +// Kernarg preload specification. +#define KERNARG_PRELOAD_SPEC(NAME, SHIFT, WIDTH) \ + AMDHSA_BITS_ENUM_ENTRY(KERNARG_PRELOAD_SPEC_##NAME, SHIFT, WIDTH) +enum : int32_t { + KERNARG_PRELOAD_SPEC(LENGTH, 0, 7), + KERNARG_PRELOAD_SPEC(OFFSET, 7, 9), +}; +#undef KERNARG_PRELOAD_SPEC + // Kernel descriptor. Must be kept backwards compatible. struct kernel_descriptor_t { uint32_t group_segment_fixed_size; @@ -185,7 +194,8 @@ uint32_t compute_pgm_rsrc1; uint32_t compute_pgm_rsrc2; uint16_t kernel_code_properties; - uint8_t reserved2[6]; + uint16_t kernarg_preload; + uint8_t reserved3[4]; }; enum : uint32_t { @@ -199,7 +209,8 @@ COMPUTE_PGM_RSRC1_OFFSET = 48, COMPUTE_PGM_RSRC2_OFFSET = 52, KERNEL_CODE_PROPERTIES_OFFSET = 56, - RESERVED2_OFFSET = 58, + KERNARG_PRELOAD_OFFSET = 58, + RESERVED3_OFFSET = 60 }; static_assert( @@ -233,8 +244,11 @@ static_assert(offsetof(kernel_descriptor_t, kernel_code_properties) == KERNEL_CODE_PROPERTIES_OFFSET, "invalid offset for kernel_code_properties"); -static_assert(offsetof(kernel_descriptor_t, reserved2) == RESERVED2_OFFSET, - "invalid offset for reserved2"); +static_assert(offsetof(kernel_descriptor_t, kernarg_preload) == + KERNARG_PRELOAD_OFFSET, + "invalid offset for kernarg_preload"); +static_assert(offsetof(kernel_descriptor_t, reserved3) == RESERVED3_OFFSET, + "invalid offset for reserved3"); } // end namespace amdhsa } // end namespace llvm diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.td b/llvm/lib/Target/AMDGPU/AMDGPU.td --- a/llvm/lib/Target/AMDGPU/AMDGPU.td +++ b/llvm/lib/Target/AMDGPU/AMDGPU.td @@ -872,6 +872,12 @@ "Requires use of fract on arguments to trig instructions" >; +def FeatureKernargPreload : SubtargetFeature <"kernarg-preload", + 
"KernargPreload", + "true", + "Hardware supports preloading of kernel arguments in user SGPRs." +>; + // Alignment enforcement is controlled by a configuration register: // SH_MEM_CONFIG.alignment_mode def FeatureUnalignedAccessMode : SubtargetFeature<"unaligned-access-mode", @@ -1185,7 +1191,8 @@ FeatureAtomicBufferGlobalPkAddF16Insts, FeaturePackedTID, FullRate64Ops, - FeatureBackOffBarrier])>; + FeatureBackOffBarrier, + FeatureKernargPreload])>; def FeatureISAVersion9_0_C : FeatureSet< !listconcat(FeatureISAVersion9_0_Common.Features, @@ -1227,7 +1234,8 @@ FeaturePackedTID, FeatureArchitectedFlatScratch, FullRate64Ops, - FeatureBackOffBarrier]>; + FeatureBackOffBarrier, + FeatureKernargPreload]>; def FeatureISAVersion9_4_0 : FeatureSet< !listconcat(FeatureISAVersion9_4_Common.Features, diff --git a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp --- a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp +++ b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp @@ -1462,6 +1462,12 @@ return AMDGPU::getNSAMaxSize(getSTI()); } + unsigned getMaxNumUserSGPRs() const { + return AMDGPU::getMaxNumUserSGPRs(getSTI()); + } + + bool hasKernargPreload() const { return AMDGPU::hasKernargPreload(getSTI()); } + AMDGPUTargetStreamer &getTargetStreamer() { MCTargetStreamer &TS = *getParser().getStreamer().getTargetStreamer(); return static_cast(TS); @@ -4931,6 +4937,8 @@ uint64_t NextFreeVGPR = 0; uint64_t AccumOffset = 0; uint64_t SharedVGPRCount = 0; + uint64_t PreloadLength = 0; + uint64_t PreloadOffset = 0; SMRange SGPRRange; uint64_t NextFreeSGPR = 0; @@ -4999,6 +5007,28 @@ Val, ValRange); if (Val) ImpliedUserSGPRCount += 4; + } else if (ID == ".amdhsa_user_sgpr_kernarg_preload_length") { + if (!hasKernargPreload()) + return Error(IDRange.Start, "directive requires gfx90a+", IDRange); + + if (Val > getMaxNumUserSGPRs()) + return OutOfRangeError(ValRange); + PARSE_BITS_ENTRY(KD.kernarg_preload, 
KERNARG_PRELOAD_SPEC_LENGTH, Val, + ValRange); + if (Val) { + ImpliedUserSGPRCount += Val; + PreloadLength = Val; + } + } else if (ID == ".amdhsa_user_sgpr_kernarg_preload_offset") { + if (!hasKernargPreload()) + return Error(IDRange.Start, "directive requires gfx90a+", IDRange); + + if (Val >= 1024) + return OutOfRangeError(ValRange); + PARSE_BITS_ENTRY(KD.kernarg_preload, KERNARG_PRELOAD_SPEC_OFFSET, Val, + ValRange); + if (Val) + PreloadOffset = Val; } else if (ID == ".amdhsa_user_sgpr_dispatch_ptr") { PARSE_BITS_ENTRY(KD.kernel_code_properties, KERNEL_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_PTR, Val, @@ -5244,6 +5274,11 @@ AMDHSA_BITS_SET(KD.compute_pgm_rsrc2, COMPUTE_PGM_RSRC2_USER_SGPR_COUNT, UserSGPRCount); + if (PreloadLength && KD.kernarg_size && + (PreloadLength * 4 + PreloadOffset * 4 > KD.kernarg_size)) + return TokError("Kernarg preload length + offset is larger than the " + "kernarg segment size"); + if (isGFX90A()) { if (!Seen.contains(".amdhsa_accum_offset")) return TokError(".amdhsa_accum_offset directive is required"); diff --git a/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h b/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h --- a/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h +++ b/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h @@ -249,6 +249,7 @@ bool isGFX11Plus() const; bool hasArchitectedFlatScratch() const; + bool hasKernargPreload() const; bool isMacDPP(MCInst &MI) const; }; diff --git a/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp b/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp --- a/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp +++ b/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp @@ -1621,6 +1621,10 @@ return STI.hasFeature(AMDGPU::FeatureArchitectedFlatScratch); } +bool AMDGPUDisassembler::hasKernargPreload() const { + return AMDGPU::hasKernargPreload(STI); +} + //===----------------------------------------------------------------------===// // AMDGPU 
specific symbol handling //===----------------------------------------------------------------------===// @@ -1945,10 +1949,24 @@ return MCDisassembler::Success; - case amdhsa::RESERVED2_OFFSET: - // 6 bytes from here are reserved, must be 0. - ReservedBytes = DE.getBytes(Cursor, 6); - for (int I = 0; I < 6; ++I) { + case amdhsa::KERNARG_PRELOAD_OFFSET: + using namespace amdhsa; + TwoByteBuffer = DE.getU16(Cursor); + if (TwoByteBuffer & KERNARG_PRELOAD_SPEC_LENGTH) { + PRINT_DIRECTIVE(".amdhsa_user_sgpr_kernarg_preload_length", + KERNARG_PRELOAD_SPEC_LENGTH); + } + + if (TwoByteBuffer & KERNARG_PRELOAD_SPEC_OFFSET) { + PRINT_DIRECTIVE(".amdhsa_user_sgpr_kernarg_preload_offset", + KERNARG_PRELOAD_SPEC_OFFSET); + } + return MCDisassembler::Success; + + case amdhsa::RESERVED3_OFFSET: + // 4 bytes from here are reserved, must be 0. + ReservedBytes = DE.getBytes(Cursor, 4); + for (int I = 0; I < 4; ++I) { if (ReservedBytes[I] != 0) return MCDisassembler::Fail; } diff --git a/llvm/lib/Target/AMDGPU/GCNSubtarget.h b/llvm/lib/Target/AMDGPU/GCNSubtarget.h --- a/llvm/lib/Target/AMDGPU/GCNSubtarget.h +++ b/llvm/lib/Target/AMDGPU/GCNSubtarget.h @@ -78,6 +78,7 @@ bool UnalignedAccessMode = false; bool HasApertureRegs = false; bool SupportsXNACK = false; + bool KernargPreload = false; // This should not be used directly. 'TargetID' tracks the dynamic settings // for XNACK. @@ -856,7 +857,7 @@ unsigned NumRegionInstrs) const override; unsigned getMaxNumUserSGPRs() const { - return 16; + return AMDGPU::getMaxNumUserSGPRs(*this); } bool hasSMemRealTime() const { @@ -1179,9 +1180,7 @@ bool hasLegacyGeometry() const { return getGeneration() < GFX11; } // \returns true if preloading kernel arguments is supported. - bool hasKernargPreload() const { - return hasGFX90AInsts() || hasGFX940Insts(); - } + bool hasKernargPreload() const { return KernargPreload; } // \returns true if we need to generate backwards compatible code when // preloading kernel arguments. 
diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp --- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp +++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp @@ -368,6 +368,12 @@ PRINT_FIELD(OS, ".amdhsa_user_sgpr_flat_scratch_init", KD, kernel_code_properties, amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_FLAT_SCRATCH_INIT); + if (hasKernargPreload(STI)) { + PRINT_FIELD(OS, ".amdhsa_user_sgpr_kernarg_preload_length", KD, + kernarg_preload, amdhsa::KERNARG_PRELOAD_SPEC_LENGTH); + PRINT_FIELD(OS, ".amdhsa_user_sgpr_kernarg_preload_offset", KD, + kernarg_preload, amdhsa::KERNARG_PRELOAD_SPEC_OFFSET); + } PRINT_FIELD(OS, ".amdhsa_user_sgpr_private_segment_size", KD, kernel_code_properties, amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_SIZE); @@ -906,6 +912,7 @@ Streamer.emitInt32(KernelDescriptor.compute_pgm_rsrc1); Streamer.emitInt32(KernelDescriptor.compute_pgm_rsrc2); Streamer.emitInt16(KernelDescriptor.kernel_code_properties); - for (uint8_t Res : KernelDescriptor.reserved2) + Streamer.emitInt16(KernelDescriptor.kernarg_preload); + for (uint8_t Res : KernelDescriptor.reserved3) Streamer.emitInt8(Res); } diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h --- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h +++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h @@ -1148,6 +1148,7 @@ bool hasPackedD16(const MCSubtargetInfo &STI); bool hasGDS(const MCSubtargetInfo &STI); unsigned getNSAMaxSize(const MCSubtargetInfo &STI); +unsigned getMaxNumUserSGPRs(const MCSubtargetInfo &STI); bool isSI(const MCSubtargetInfo &STI); bool isCI(const MCSubtargetInfo &STI); @@ -1174,6 +1175,7 @@ bool hasMAIInsts(const MCSubtargetInfo &STI); bool hasVOPD(const MCSubtargetInfo &STI); int getTotalNumVGPRs(bool has90AInsts, int32_t ArgNumAGPR, int32_t ArgNumVGPR); +bool hasKernargPreload(const MCSubtargetInfo &STI);
/// Is Reg - scalar register bool isSGPR(unsigned Reg, const MCRegisterInfo* TRI); diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp --- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp +++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp @@ -2047,6 +2047,8 @@ return 0; } +unsigned getMaxNumUserSGPRs(const MCSubtargetInfo &STI) { return 16; } + bool isSI(const MCSubtargetInfo &STI) { return STI.hasFeature(AMDGPU::FeatureSouthernIslands); } @@ -2143,6 +2145,10 @@ return STI.hasFeature(AMDGPU::FeatureVOPD); } +bool hasKernargPreload(const MCSubtargetInfo &STI) { + return STI.hasFeature(AMDGPU::FeatureKernargPreload); +} + int32_t getTotalNumVGPRs(bool has90AInsts, int32_t ArgNumAGPR, int32_t ArgNumVGPR) { if (has90AInsts && ArgNumAGPR) diff --git a/llvm/test/MC/AMDGPU/hsa-gfx90a-v3.s b/llvm/test/MC/AMDGPU/hsa-gfx90a-v3.s --- a/llvm/test/MC/AMDGPU/hsa-gfx90a-v3.s +++ b/llvm/test/MC/AMDGPU/hsa-gfx90a-v3.s @@ -25,10 +25,10 @@ // OBJDUMP-NEXT: 0020 00000000 00000000 00000000 00000000 // OBJDUMP-NEXT: 0030 0000ac00 80000000 00000000 00000000 // complete -// OBJDUMP-NEXT: 0040 01000000 01000000 00000000 00000000 +// OBJDUMP-NEXT: 0040 01000000 01000000 08000000 00000000 // OBJDUMP-NEXT: 0050 00000000 00000000 00000000 00000000 // OBJDUMP-NEXT: 0060 00000000 00000000 00000000 00000100 -// OBJDUMP-NEXT: 0070 c1500104 1f0f007f 7f000000 00000000 +// OBJDUMP-NEXT: 0070 c1500104 210f007f 7f008100 00000000 .text // ASM: .text @@ -76,6 +76,9 @@ .amdhsa_user_sgpr_kernarg_segment_ptr 1 .amdhsa_user_sgpr_dispatch_id 1 .amdhsa_user_sgpr_flat_scratch_init 1 + .amdhsa_kernarg_size 8 + .amdhsa_user_sgpr_kernarg_preload_length 1 + .amdhsa_user_sgpr_kernarg_preload_offset 1 .amdhsa_user_sgpr_private_segment_size 1 .amdhsa_system_sgpr_private_segment_wavefront_offset 1 .amdhsa_system_sgpr_workgroup_id_x 0 @@ -108,14 +111,16 @@ // ASM: .amdhsa_kernel complete // ASM-NEXT: .amdhsa_group_segment_fixed_size 1 // ASM-NEXT: 
.amdhsa_private_segment_fixed_size 1 -// ASM-NEXT: .amdhsa_kernarg_size 0 -// ASM-NEXT: .amdhsa_user_sgpr_count 15 +// ASM-NEXT: .amdhsa_kernarg_size 8 +// ASM-NEXT: .amdhsa_user_sgpr_count 16 // ASM-NEXT: .amdhsa_user_sgpr_private_segment_buffer 1 // ASM-NEXT: .amdhsa_user_sgpr_dispatch_ptr 1 // ASM-NEXT: .amdhsa_user_sgpr_queue_ptr 1 // ASM-NEXT: .amdhsa_user_sgpr_kernarg_segment_ptr 1 // ASM-NEXT: .amdhsa_user_sgpr_dispatch_id 1 // ASM-NEXT: .amdhsa_user_sgpr_flat_scratch_init 1 +// ASM-NEXT: .amdhsa_user_sgpr_kernarg_preload_length 1 +// ASM-NEXT: .amdhsa_user_sgpr_kernarg_preload_offset 1 // ASM-NEXT: .amdhsa_user_sgpr_private_segment_size 1 // ASM-NEXT: .amdhsa_system_sgpr_private_segment_wavefront_offset 1 // ASM-NEXT: .amdhsa_system_sgpr_workgroup_id_x 0 diff --git a/llvm/test/MC/AMDGPU/hsa-gfx940-v3.s b/llvm/test/MC/AMDGPU/hsa-gfx940-v3.s --- a/llvm/test/MC/AMDGPU/hsa-gfx940-v3.s +++ b/llvm/test/MC/AMDGPU/hsa-gfx940-v3.s @@ -25,10 +25,10 @@ // OBJDUMP-NEXT: 0020 00000000 00000000 00000000 00000000 // OBJDUMP-NEXT: 0030 0000ac00 80000000 00000000 00000000 // complete -// OBJDUMP-NEXT: 0040 01000000 01000000 00000000 00000000 +// OBJDUMP-NEXT: 0040 01000000 01000000 08000000 00000000 // OBJDUMP-NEXT: 0050 00000000 00000000 00000000 00000000 // OBJDUMP-NEXT: 0060 00000000 00000000 00000000 00000100 -// OBJDUMP-NEXT: 0070 01510104 130f007f 5e000000 00000000 +// OBJDUMP-NEXT: 0070 01510104 150f007f 5e008100 00000000 .text // ASM: .text @@ -74,6 +74,9 @@ .amdhsa_user_sgpr_queue_ptr 1 .amdhsa_user_sgpr_kernarg_segment_ptr 1 .amdhsa_user_sgpr_dispatch_id 1 + .amdhsa_kernarg_size 8 + .amdhsa_user_sgpr_kernarg_preload_length 1 + .amdhsa_user_sgpr_kernarg_preload_offset 1 .amdhsa_user_sgpr_private_segment_size 1 .amdhsa_enable_private_segment 1 .amdhsa_system_sgpr_workgroup_id_x 0 @@ -105,12 +108,14 @@ // ASM: .amdhsa_kernel complete // ASM-NEXT: .amdhsa_group_segment_fixed_size 1 // ASM-NEXT: .amdhsa_private_segment_fixed_size 1 -// ASM-NEXT: .amdhsa_kernarg_size 0 
-// ASM-NEXT: .amdhsa_user_sgpr_count 9 +// ASM-NEXT: .amdhsa_kernarg_size 8 +// ASM-NEXT: .amdhsa_user_sgpr_count 10 // ASM-NEXT: .amdhsa_user_sgpr_dispatch_ptr 1 // ASM-NEXT: .amdhsa_user_sgpr_queue_ptr 1 // ASM-NEXT: .amdhsa_user_sgpr_kernarg_segment_ptr 1 // ASM-NEXT: .amdhsa_user_sgpr_dispatch_id 1 +// ASM-NEXT: .amdhsa_user_sgpr_kernarg_preload_length 1 +// ASM-NEXT: .amdhsa_user_sgpr_kernarg_preload_offset 1 // ASM-NEXT: .amdhsa_user_sgpr_private_segment_size 1 // ASM-NEXT: .amdhsa_enable_private_segment 1 // ASM-NEXT: .amdhsa_system_sgpr_workgroup_id_x 0 diff --git a/llvm/test/MC/AMDGPU/user-sgpr-count-diag.s b/llvm/test/MC/AMDGPU/user-sgpr-count-diag.s --- a/llvm/test/MC/AMDGPU/user-sgpr-count-diag.s +++ b/llvm/test/MC/AMDGPU/user-sgpr-count-diag.s @@ -1,8 +1,9 @@ -// RUN: not llvm-mc --amdhsa-code-object-version=3 -triple amdgcn-amd-amdhsa -mcpu=gfx810 %s 2>&1 >/dev/null | FileCheck -check-prefix=ERR %s +// RUN: not llvm-mc --amdhsa-code-object-version=3 -triple amdgcn-amd-amdhsa -mcpu=gfx90a %s 2>&1 >/dev/null | FileCheck -check-prefix=ERR %s .amdhsa_kernel implied_count_too_low_0 .amdhsa_user_sgpr_count 0 .amdhsa_user_sgpr_queue_ptr 1 + .amdhsa_accum_offset 4 .amdhsa_next_free_vgpr 32 .amdhsa_next_free_sgpr 32 // ERR: :[[@LINE+1]]:19: error: amdgpu_user_sgpr_count smaller than than implied by enabled user SGPRs @@ -11,7 +12,30 @@ .amdhsa_kernel implied_count_too_low_1 .amdhsa_user_sgpr_count 1 .amdhsa_user_sgpr_queue_ptr 1 + .amdhsa_accum_offset 4 .amdhsa_next_free_vgpr 32 .amdhsa_next_free_sgpr 32 // ERR: :[[@LINE+1]]:19: error: amdgpu_user_sgpr_count smaller than than implied by enabled user SGPRs .end_amdhsa_kernel + +.amdhsa_kernel implied_count_too_low_2 + .amdhsa_user_sgpr_count 2 + .amdhsa_user_sgpr_queue_ptr 1 + .amdhsa_user_sgpr_kernarg_preload_length 1 + .amdhsa_accum_offset 4 + .amdhsa_next_free_vgpr 32 + .amdhsa_next_free_sgpr 32 +// ERR: :[[@LINE+1]]:19: error: amdgpu_user_sgpr_count smaller than than implied by enabled user SGPRs 
+.end_amdhsa_kernel + +.amdhsa_kernel preload_out_of_bounds_0 + .amdhsa_user_sgpr_count 4 + .amdhsa_user_sgpr_queue_ptr 1 + .amdhsa_user_sgpr_kernarg_preload_length 1 + .amdhsa_user_sgpr_kernarg_preload_offset 1 + .amdhsa_kernarg_size 4 + .amdhsa_accum_offset 4 + .amdhsa_next_free_vgpr 32 + .amdhsa_next_free_sgpr 32 +// ERR: :[[@LINE+1]]:19: error: Kernarg preload length + offset is larger than the kernarg segment size +.end_amdhsa_kernel diff --git a/llvm/test/MC/AMDGPU/user-sgpr-count.s b/llvm/test/MC/AMDGPU/user-sgpr-count.s --- a/llvm/test/MC/AMDGPU/user-sgpr-count.s +++ b/llvm/test/MC/AMDGPU/user-sgpr-count.s @@ -1,10 +1,10 @@ -// RUN: llvm-mc -triple amdgcn-amd-amdhsa -mcpu=gfx904 --amdhsa-code-object-version=3 -mattr=+xnack < %s | FileCheck --check-prefix=ASM %s +// RUN: llvm-mc -triple amdgcn-amd-amdhsa -mcpu=gfx90a --amdhsa-code-object-version=3 -mattr=+xnack < %s | FileCheck --check-prefix=ASM %s .text // ASM: .text -.amdgcn_target "amdgcn-amd-amdhsa--gfx904+xnack" -// ASM: .amdgcn_target "amdgcn-amd-amdhsa--gfx904+xnack" +.amdgcn_target "amdgcn-amd-amdhsa--gfx90a+xnack+sram-ecc" +// ASM: .amdgcn_target "amdgcn-amd-amdhsa--gfx90a+xnack+sram-ecc" // ASM-LABEL: .amdhsa_kernel user_sgprs_implied_count @@ -17,6 +17,7 @@ .amdhsa_user_sgpr_dispatch_id 1 .amdhsa_user_sgpr_flat_scratch_init 1 .amdhsa_user_sgpr_private_segment_size 1 + .amdhsa_accum_offset 4 .amdhsa_next_free_vgpr 32 .amdhsa_next_free_sgpr 32 .end_amdhsa_kernel @@ -28,6 +29,7 @@ .amdhsa_user_sgpr_kernarg_segment_ptr 1 .amdhsa_user_sgpr_flat_scratch_init 1 .amdhsa_user_sgpr_private_segment_size 1 + .amdhsa_accum_offset 4 .amdhsa_next_free_vgpr 32 .amdhsa_next_free_sgpr 32 .end_amdhsa_kernel @@ -39,6 +41,7 @@ .amdhsa_user_sgpr_queue_ptr 1 .amdhsa_user_sgpr_kernarg_segment_ptr 1 .amdhsa_user_sgpr_private_segment_size 1 + .amdhsa_accum_offset 4 .amdhsa_next_free_vgpr 32 .amdhsa_next_free_sgpr 32 .end_amdhsa_kernel @@ -48,6 +51,7 @@ // ASM: .amdhsa_user_sgpr_count 4 .amdhsa_kernel 
user_sgprs_implied_count_private_segment_buffer .amdhsa_user_sgpr_private_segment_buffer 1 + .amdhsa_accum_offset 4 .amdhsa_next_free_vgpr 32 .amdhsa_next_free_sgpr 32 .end_amdhsa_kernel @@ -56,6 +60,7 @@ // ASM-LABEL: .amdhsa_kernel explicit_user_sgpr_count_16 .amdhsa_kernel explicit_user_sgpr_count_16 .amdhsa_user_sgpr_count 16 + .amdhsa_accum_offset 4 .amdhsa_next_free_vgpr 32 .amdhsa_next_free_sgpr 32 .end_amdhsa_kernel @@ -65,6 +70,7 @@ // ASM: .amdhsa_user_sgpr_count 0 .amdhsa_kernel explicit_user_sgpr_count_0 .amdhsa_user_sgpr_count 0 + .amdhsa_accum_offset 4 .amdhsa_next_free_vgpr 32 .amdhsa_next_free_sgpr 32 .end_amdhsa_kernel @@ -73,6 +79,7 @@ // ASM: .amdhsa_user_sgpr_count 1 .amdhsa_kernel explicit_user_sgpr_count_1 .amdhsa_user_sgpr_count 1 + .amdhsa_accum_offset 4 .amdhsa_next_free_vgpr 32 .amdhsa_next_free_sgpr 32 .end_amdhsa_kernel @@ -82,6 +89,39 @@ .amdhsa_user_sgpr_private_segment_buffer 1 .amdhsa_user_sgpr_queue_ptr 1 .amdhsa_user_sgpr_kernarg_segment_ptr 1 + .amdhsa_accum_offset 4 + .amdhsa_next_free_vgpr 32 + .amdhsa_next_free_sgpr 32 +.end_amdhsa_kernel + +.amdhsa_kernel preload_kernarg_0 + .amdhsa_user_sgpr_count 3 + .amdhsa_user_sgpr_queue_ptr 1 + .amdhsa_user_sgpr_kernarg_preload_length 1 + .amdhsa_user_sgpr_kernarg_preload_offset 1 + .amdhsa_kernarg_size 8 + .amdhsa_accum_offset 4 + .amdhsa_next_free_vgpr 32 + .amdhsa_next_free_sgpr 32 +.end_amdhsa_kernel + +.amdhsa_kernel preload_kernarg_1 + .amdhsa_user_sgpr_count 3 + .amdhsa_user_sgpr_queue_ptr 1 + .amdhsa_user_sgpr_kernarg_preload_length 0 + .amdhsa_user_sgpr_kernarg_preload_offset 10 + .amdhsa_kernarg_size 0 + .amdhsa_accum_offset 4 + .amdhsa_next_free_vgpr 32 + .amdhsa_next_free_sgpr 32 +.end_amdhsa_kernel + +.amdhsa_kernel preload_kernarg_2 + .amdhsa_user_sgpr_count 3 + .amdhsa_user_sgpr_queue_ptr 1 + .amdhsa_user_sgpr_kernarg_preload_length 1 + .amdhsa_user_sgpr_kernarg_preload_offset 0 + .amdhsa_accum_offset 4 .amdhsa_next_free_vgpr 32 .amdhsa_next_free_sgpr 32 .end_amdhsa_kernel 
diff --git a/llvm/test/tools/llvm-objdump/ELF/AMDGPU/kd-gfx90a.s b/llvm/test/tools/llvm-objdump/ELF/AMDGPU/kd-gfx90a.s --- a/llvm/test/tools/llvm-objdump/ELF/AMDGPU/kd-gfx90a.s +++ b/llvm/test/tools/llvm-objdump/ELF/AMDGPU/kd-gfx90a.s @@ -110,7 +110,7 @@ ; CHECK: .amdhsa_kernel kernel ; CHECK-NEXT: .amdhsa_group_segment_fixed_size 0 ; CHECK-NEXT: .amdhsa_private_segment_fixed_size 0 -; CHECK-NEXT: .amdhsa_kernarg_size 0 +; CHECK-NEXT: .amdhsa_kernarg_size 32 ; CHECK-NEXT: .amdhsa_accum_offset 12 ; CHECK-NEXT: .amdhsa_tg_split 0 ; CHECK-NEXT: .amdhsa_next_free_vgpr 32 @@ -145,9 +145,14 @@ ; CHECK-NEXT: .amdhsa_user_sgpr_dispatch_id 0 ; CHECK-NEXT: .amdhsa_user_sgpr_flat_scratch_init 0 ; CHECK-NEXT: .amdhsa_user_sgpr_private_segment_size 0 +; CHECK-NEXT: .amdhsa_user_sgpr_kernarg_preload_length 2 +; CHECK-NEXT: .amdhsa_user_sgpr_kernarg_preload_offset 1 ; CHECK-NEXT: .end_amdhsa_kernel .amdhsa_kernel kernel .amdhsa_next_free_vgpr 32 .amdhsa_next_free_sgpr 0 .amdhsa_accum_offset 12 + .amdhsa_kernarg_size 32 + .amdhsa_user_sgpr_kernarg_preload_length 2 + .amdhsa_user_sgpr_kernarg_preload_offset 1 .end_amdhsa_kernel