diff --git a/llvm/include/llvm/Support/AMDHSAKernelDescriptor.h b/llvm/include/llvm/Support/AMDHSAKernelDescriptor.h --- a/llvm/include/llvm/Support/AMDHSAKernelDescriptor.h +++ b/llvm/include/llvm/Support/AMDHSAKernelDescriptor.h @@ -173,6 +173,15 @@ }; #undef KERNEL_CODE_PROPERTY +// Kernarg preload specification: LENGTH is bits [6:0], OFFSET is bits [15:7]. +#define KERNARG_PRELOAD_SPEC(NAME, SHIFT, WIDTH) \ + AMDHSA_BITS_ENUM_ENTRY(KERNARG_PRELOAD_SPEC_##NAME, SHIFT, WIDTH) +enum : int32_t { + KERNARG_PRELOAD_SPEC(LENGTH, 0, 7), + KERNARG_PRELOAD_SPEC(OFFSET, 7, 9), +}; +#undef KERNARG_PRELOAD_SPEC + // Kernel descriptor. Must be kept backwards compatible. struct kernel_descriptor_t { uint32_t group_segment_fixed_size; @@ -185,7 +194,8 @@ uint32_t compute_pgm_rsrc1; uint32_t compute_pgm_rsrc2; uint16_t kernel_code_properties; - uint8_t reserved2[6]; + uint16_t kernarg_preload; + uint8_t reserved3[4]; }; enum : uint32_t { @@ -199,7 +209,8 @@ COMPUTE_PGM_RSRC1_OFFSET = 48, COMPUTE_PGM_RSRC2_OFFSET = 52, KERNEL_CODE_PROPERTIES_OFFSET = 56, - RESERVED2_OFFSET = 58, + KERNARG_PRELOAD_OFFSET = 58, + RESERVED3_OFFSET = 60, }; static_assert( @@ -233,8 +244,11 @@ static_assert(offsetof(kernel_descriptor_t, kernel_code_properties) == KERNEL_CODE_PROPERTIES_OFFSET, "invalid offset for kernel_code_properties"); -static_assert(offsetof(kernel_descriptor_t, reserved2) == RESERVED2_OFFSET, - "invalid offset for reserved2"); +static_assert(offsetof(kernel_descriptor_t, kernarg_preload) == + KERNARG_PRELOAD_OFFSET, + "invalid offset for kernarg_preload"); +static_assert(offsetof(kernel_descriptor_t, reserved3) == RESERVED3_OFFSET, + "invalid offset for reserved3"); } // end namespace amdhsa } // end namespace llvm diff --git a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp --- a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp +++ b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp @@ -1462,6 +1462,10 @@ return AMDGPU::getNSAMaxSize(getSTI()); } + unsigned 
getMaxNumUserSGPRs() const { + return AMDGPU::getMaxNumUserSGPRs(getSTI()); + } + AMDGPUTargetStreamer &getTargetStreamer() { MCTargetStreamer &TS = *getParser().getStreamer().getTargetStreamer(); return static_cast(TS); @@ -4933,6 +4937,8 @@ // Count the number of user SGPRs implied from the enabled feature bits. unsigned ImpliedUserSGPRCount = 0; + // Track the number of SGPRs used to preload kernel arguments. + unsigned KernargPreloadSGPRCount = 0; // Track if the asm explicitly contains the directive for the user SGPR // count. @@ -4996,6 +5002,18 @@ Val, ValRange); if (Val) ImpliedUserSGPRCount += 4; + } else if (ID == ".amdhsa_user_sgpr_kernarg_preload_length") { + if (Val > getMaxNumUserSGPRs()) + return OutOfRangeError(ValRange); + PARSE_BITS_ENTRY(KD.kernarg_preload, KERNARG_PRELOAD_SPEC_LENGTH, Val, + ValRange); + if (Val) + KernargPreloadSGPRCount += Val; + } else if (ID == ".amdhsa_user_sgpr_kernarg_preload_offset") { + if (Val >= 1024) + return OutOfRangeError(ValRange); + PARSE_BITS_ENTRY(KD.kernarg_preload, KERNARG_PRELOAD_SPEC_OFFSET, Val, + ValRange); } else if (ID == ".amdhsa_user_sgpr_dispatch_ptr") { PARSE_BITS_ENTRY(KD.kernel_code_properties, KERNEL_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_PTR, Val, @@ -5236,6 +5254,11 @@ unsigned UserSGPRCount = ExplicitUserSGPRCount ? 
*ExplicitUserSGPRCount : ImpliedUserSGPRCount; + if (KernargPreloadSGPRCount && + KernargPreloadSGPRCount + UserSGPRCount > getMaxNumUserSGPRs()) + return TokError(".amdhsa_user_sgpr_kernarg_preload_length + implicit user " + "SGPR count exceeds maximum supported."); + if (!isUInt(UserSGPRCount)) return TokError("too many user SGPRs enabled"); AMDHSA_BITS_SET(KD.compute_pgm_rsrc2, COMPUTE_PGM_RSRC2_USER_SGPR_COUNT, diff --git a/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp b/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp --- a/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp +++ b/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp @@ -1945,10 +1945,19 @@ return MCDisassembler::Success; - case amdhsa::RESERVED2_OFFSET: - // 6 bytes from here are reserved, must be 0. - ReservedBytes = DE.getBytes(Cursor, 6); - for (int I = 0; I < 6; ++I) { + case amdhsa::KERNARG_PRELOAD_OFFSET: + using namespace amdhsa; + TwoByteBuffer = DE.getU16(Cursor); + PRINT_DIRECTIVE(".amdhsa_user_sgpr_kernarg_preload_length", + KERNARG_PRELOAD_SPEC_LENGTH); + PRINT_DIRECTIVE(".amdhsa_user_sgpr_kernarg_preload_offset", + KERNARG_PRELOAD_SPEC_OFFSET); + return MCDisassembler::Success; + + case amdhsa::RESERVED3_OFFSET: + // 4 bytes from here are reserved, must be 0. 
+ ReservedBytes = DE.getBytes(Cursor, 4); + for (int I = 0; I < 4; ++I) { if (ReservedBytes[I] != 0) return MCDisassembler::Fail; } diff --git a/llvm/lib/Target/AMDGPU/GCNSubtarget.h b/llvm/lib/Target/AMDGPU/GCNSubtarget.h --- a/llvm/lib/Target/AMDGPU/GCNSubtarget.h +++ b/llvm/lib/Target/AMDGPU/GCNSubtarget.h @@ -856,7 +856,7 @@ unsigned NumRegionInstrs) const override; unsigned getMaxNumUserSGPRs() const { - return 16; + return AMDGPU::getMaxNumUserSGPRs(*this); } bool hasSMemRealTime() const { diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp --- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp +++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp @@ -368,6 +368,10 @@ PRINT_FIELD(OS, ".amdhsa_user_sgpr_flat_scratch_init", KD, kernel_code_properties, amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_FLAT_SCRATCH_INIT); + PRINT_FIELD(OS, ".amdhsa_user_sgpr_kernarg_preload_length", KD, + kernarg_preload, amdhsa::KERNARG_PRELOAD_SPEC_LENGTH); + PRINT_FIELD(OS, ".amdhsa_user_sgpr_kernarg_preload_offset", KD, + kernarg_preload, amdhsa::KERNARG_PRELOAD_SPEC_OFFSET); PRINT_FIELD(OS, ".amdhsa_user_sgpr_private_segment_size", KD, kernel_code_properties, amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_SIZE); @@ -906,6 +910,7 @@ Streamer.emitInt32(KernelDescriptor.compute_pgm_rsrc1); Streamer.emitInt32(KernelDescriptor.compute_pgm_rsrc2); Streamer.emitInt16(KernelDescriptor.kernel_code_properties); - for (uint8_t Res : KernelDescriptor.reserved2) + Streamer.emitInt16(KernelDescriptor.kernarg_preload); + for (uint8_t Res : KernelDescriptor.reserved3) Streamer.emitInt8(Res); } diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h --- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h +++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h @@ -1148,6 +1148,7 @@ bool hasPackedD16(const MCSubtargetInfo &STI); bool 
hasGDS(const MCSubtargetInfo &STI); unsigned getNSAMaxSize(const MCSubtargetInfo &STI); +unsigned getMaxNumUserSGPRs(const MCSubtargetInfo &STI); bool isSI(const MCSubtargetInfo &STI); bool isCI(const MCSubtargetInfo &STI); diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp --- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp +++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp @@ -2047,6 +2047,8 @@ return 0; } +unsigned getMaxNumUserSGPRs(const MCSubtargetInfo &STI) { return 16; } + bool isSI(const MCSubtargetInfo &STI) { return STI.hasFeature(AMDGPU::FeatureSouthernIslands); } diff --git a/llvm/test/CodeGen/AMDGPU/gfx11-user-sgpr-init16-bug.ll b/llvm/test/CodeGen/AMDGPU/gfx11-user-sgpr-init16-bug.ll --- a/llvm/test/CodeGen/AMDGPU/gfx11-user-sgpr-init16-bug.ll +++ b/llvm/test/CodeGen/AMDGPU/gfx11-user-sgpr-init16-bug.ll @@ -34,6 +34,8 @@ ; GCN-NEXT: .amdhsa_user_sgpr_queue_ptr 0 ; GCN-NEXT: .amdhsa_user_sgpr_kernarg_segment_ptr 0 ; GCN-NEXT: .amdhsa_user_sgpr_dispatch_id 0 +; GCN-NEXT: .amdhsa_user_sgpr_kernarg_preload_length 0 +; GCN-NEXT: .amdhsa_user_sgpr_kernarg_preload_offset 0 ; GCN-NEXT: .amdhsa_user_sgpr_private_segment_size 0 ; GCN-NEXT: .amdhsa_wavefront_size32 ; GCN-NEXT: .amdhsa_enable_private_segment 0 @@ -62,6 +64,8 @@ ; GCN-NEXT: .amdhsa_user_sgpr_queue_ptr 0 ; GCN-NEXT: .amdhsa_user_sgpr_kernarg_segment_ptr 0 ; GCN-NEXT: .amdhsa_user_sgpr_dispatch_id 0 +; GCN-NEXT: .amdhsa_user_sgpr_kernarg_preload_length 0 +; GCN-NEXT: .amdhsa_user_sgpr_kernarg_preload_offset 0 ; GCN-NEXT: .amdhsa_user_sgpr_private_segment_size 0 ; GCN-NEXT: .amdhsa_wavefront_size32 ; GCN-NEXT: .amdhsa_enable_private_segment 1 @@ -94,6 +98,8 @@ ; GCN-NEXT: .amdhsa_user_sgpr_queue_ptr 1 ; GCN-NEXT: .amdhsa_user_sgpr_kernarg_segment_ptr 0 ; GCN-NEXT: .amdhsa_user_sgpr_dispatch_id 0 +; GCN-NEXT: .amdhsa_user_sgpr_kernarg_preload_length 0 +; GCN-NEXT: .amdhsa_user_sgpr_kernarg_preload_offset 0 ; GCN-NEXT: 
.amdhsa_user_sgpr_private_segment_size 0 ; GCN-NEXT: .amdhsa_wavefront_size32 ; GCN-NEXT: .amdhsa_enable_private_segment 0 @@ -140,6 +146,8 @@ ; GCN-NEXT: .amdhsa_user_sgpr_queue_ptr 1 ; GCN-NEXT: .amdhsa_user_sgpr_kernarg_segment_ptr 1 ; GCN-NEXT: .amdhsa_user_sgpr_dispatch_id 1 +; GCN-NEXT: .amdhsa_user_sgpr_kernarg_preload_length 0 +; GCN-NEXT: .amdhsa_user_sgpr_kernarg_preload_offset 0 ; GCN-NEXT: .amdhsa_user_sgpr_private_segment_size 0 ; GCN-NEXT: .amdhsa_wavefront_size32 ; GCN-NEXT: .amdhsa_enable_private_segment 1 diff --git a/llvm/test/CodeGen/AMDGPU/stack-realign-kernel.ll b/llvm/test/CodeGen/AMDGPU/stack-realign-kernel.ll --- a/llvm/test/CodeGen/AMDGPU/stack-realign-kernel.ll +++ b/llvm/test/CodeGen/AMDGPU/stack-realign-kernel.ll @@ -25,6 +25,8 @@ ; VI-NEXT: .amdhsa_user_sgpr_kernarg_segment_ptr 0 ; VI-NEXT: .amdhsa_user_sgpr_dispatch_id 0 ; VI-NEXT: .amdhsa_user_sgpr_flat_scratch_init 1 +; VI-NEXT: .amdhsa_user_sgpr_kernarg_preload_length 0 +; VI-NEXT: .amdhsa_user_sgpr_kernarg_preload_offset 0 ; VI-NEXT: .amdhsa_user_sgpr_private_segment_size 0 ; VI-NEXT: .amdhsa_system_sgpr_private_segment_wavefront_offset 1 ; VI-NEXT: .amdhsa_system_sgpr_workgroup_id_x 1 @@ -73,6 +75,8 @@ ; GFX9-NEXT: .amdhsa_user_sgpr_kernarg_segment_ptr 0 ; GFX9-NEXT: .amdhsa_user_sgpr_dispatch_id 0 ; GFX9-NEXT: .amdhsa_user_sgpr_flat_scratch_init 1 +; GFX9-NEXT: .amdhsa_user_sgpr_kernarg_preload_length 0 +; GFX9-NEXT: .amdhsa_user_sgpr_kernarg_preload_offset 0 ; GFX9-NEXT: .amdhsa_user_sgpr_private_segment_size 0 ; GFX9-NEXT: .amdhsa_system_sgpr_private_segment_wavefront_offset 1 ; GFX9-NEXT: .amdhsa_system_sgpr_workgroup_id_x 1 @@ -128,6 +132,8 @@ ; VI-NEXT: .amdhsa_user_sgpr_kernarg_segment_ptr 0 ; VI-NEXT: .amdhsa_user_sgpr_dispatch_id 0 ; VI-NEXT: .amdhsa_user_sgpr_flat_scratch_init 1 +; VI-NEXT: .amdhsa_user_sgpr_kernarg_preload_length 0 +; VI-NEXT: .amdhsa_user_sgpr_kernarg_preload_offset 0 ; VI-NEXT: .amdhsa_user_sgpr_private_segment_size 0 ; VI-NEXT: 
.amdhsa_system_sgpr_private_segment_wavefront_offset 1 ; VI-NEXT: .amdhsa_system_sgpr_workgroup_id_x 1 @@ -176,6 +182,8 @@ ; GFX9-NEXT: .amdhsa_user_sgpr_kernarg_segment_ptr 0 ; GFX9-NEXT: .amdhsa_user_sgpr_dispatch_id 0 ; GFX9-NEXT: .amdhsa_user_sgpr_flat_scratch_init 1 +; GFX9-NEXT: .amdhsa_user_sgpr_kernarg_preload_length 0 +; GFX9-NEXT: .amdhsa_user_sgpr_kernarg_preload_offset 0 ; GFX9-NEXT: .amdhsa_user_sgpr_private_segment_size 0 ; GFX9-NEXT: .amdhsa_system_sgpr_private_segment_wavefront_offset 1 ; GFX9-NEXT: .amdhsa_system_sgpr_workgroup_id_x 1 @@ -231,6 +239,8 @@ ; VI-NEXT: .amdhsa_user_sgpr_kernarg_segment_ptr 0 ; VI-NEXT: .amdhsa_user_sgpr_dispatch_id 0 ; VI-NEXT: .amdhsa_user_sgpr_flat_scratch_init 1 +; VI-NEXT: .amdhsa_user_sgpr_kernarg_preload_length 0 +; VI-NEXT: .amdhsa_user_sgpr_kernarg_preload_offset 0 ; VI-NEXT: .amdhsa_user_sgpr_private_segment_size 0 ; VI-NEXT: .amdhsa_system_sgpr_private_segment_wavefront_offset 1 ; VI-NEXT: .amdhsa_system_sgpr_workgroup_id_x 1 @@ -279,6 +289,8 @@ ; GFX9-NEXT: .amdhsa_user_sgpr_kernarg_segment_ptr 0 ; GFX9-NEXT: .amdhsa_user_sgpr_dispatch_id 0 ; GFX9-NEXT: .amdhsa_user_sgpr_flat_scratch_init 1 +; GFX9-NEXT: .amdhsa_user_sgpr_kernarg_preload_length 0 +; GFX9-NEXT: .amdhsa_user_sgpr_kernarg_preload_offset 0 ; GFX9-NEXT: .amdhsa_user_sgpr_private_segment_size 0 ; GFX9-NEXT: .amdhsa_system_sgpr_private_segment_wavefront_offset 1 ; GFX9-NEXT: .amdhsa_system_sgpr_workgroup_id_x 1 diff --git a/llvm/test/MC/AMDGPU/hsa-gfx10-v3.s b/llvm/test/MC/AMDGPU/hsa-gfx10-v3.s --- a/llvm/test/MC/AMDGPU/hsa-gfx10-v3.s +++ b/llvm/test/MC/AMDGPU/hsa-gfx10-v3.s @@ -31,7 +31,7 @@ // OBJDUMP-NEXT: 0040 01000000 01000000 08000000 00000000 // OBJDUMP-NEXT: 0050 00000000 00000000 00000000 00000000 // OBJDUMP-NEXT: 0060 00000000 00000000 00000000 00000000 -// OBJDUMP-NEXT: 0070 015001e4 1f0f007f 7f040000 00000000 +// OBJDUMP-NEXT: 0070 015001e4 1f0f007f 7f048100 00000000 // special_sgpr // OBJDUMP-NEXT: 0080 00000000 00000000 00000000 
00000000 // OBJDUMP-NEXT: 0090 00000000 00000000 00000000 00000000 @@ -89,6 +89,8 @@ .amdhsa_user_sgpr_kernarg_segment_ptr 1 .amdhsa_user_sgpr_dispatch_id 1 .amdhsa_user_sgpr_flat_scratch_init 1 + .amdhsa_user_sgpr_kernarg_preload_length 1 + .amdhsa_user_sgpr_kernarg_preload_offset 1 .amdhsa_user_sgpr_private_segment_size 1 .amdhsa_wavefront_size32 1 .amdhsa_system_sgpr_private_segment_wavefront_offset 1 @@ -132,6 +134,8 @@ // ASM-NEXT: .amdhsa_user_sgpr_kernarg_segment_ptr 1 // ASM-NEXT: .amdhsa_user_sgpr_dispatch_id 1 // ASM-NEXT: .amdhsa_user_sgpr_flat_scratch_init 1 +// ASM-NEXT: .amdhsa_user_sgpr_kernarg_preload_length 1 +// ASM-NEXT: .amdhsa_user_sgpr_kernarg_preload_offset 1 // ASM-NEXT: .amdhsa_user_sgpr_private_segment_size 1 // ASM-NEXT: .amdhsa_wavefront_size32 1 // ASM-NEXT: .amdhsa_system_sgpr_private_segment_wavefront_offset 1 diff --git a/llvm/test/MC/AMDGPU/hsa-gfx11-v3.s b/llvm/test/MC/AMDGPU/hsa-gfx11-v3.s --- a/llvm/test/MC/AMDGPU/hsa-gfx11-v3.s +++ b/llvm/test/MC/AMDGPU/hsa-gfx11-v3.s @@ -31,7 +31,7 @@ // OBJDUMP-NEXT: 0040 01000000 01000000 08000000 00000000 // OBJDUMP-NEXT: 0050 00000000 00000000 00000000 00000000 // OBJDUMP-NEXT: 0060 00000000 00000000 00000000 00000000 -// OBJDUMP-NEXT: 0070 015001e4 130f007f 5e040000 00000000 +// OBJDUMP-NEXT: 0070 015001e4 130f007f 5e048100 00000000 // special_sgpr // OBJDUMP-NEXT: 0080 00000000 00000000 00000000 00000000 // OBJDUMP-NEXT: 0090 00000000 00000000 00000000 00000000 @@ -85,6 +85,8 @@ .amdhsa_user_sgpr_queue_ptr 1 .amdhsa_user_sgpr_kernarg_segment_ptr 1 .amdhsa_user_sgpr_dispatch_id 1 + .amdhsa_user_sgpr_kernarg_preload_length 1 + .amdhsa_user_sgpr_kernarg_preload_offset 1 .amdhsa_user_sgpr_private_segment_size 1 .amdhsa_wavefront_size32 1 .amdhsa_enable_private_segment 1 @@ -124,6 +126,8 @@ // ASM-NEXT: .amdhsa_user_sgpr_queue_ptr 1 // ASM-NEXT: .amdhsa_user_sgpr_kernarg_segment_ptr 1 // ASM-NEXT: .amdhsa_user_sgpr_dispatch_id 1 +// ASM-NEXT: .amdhsa_user_sgpr_kernarg_preload_length 1 +// 
ASM-NEXT: .amdhsa_user_sgpr_kernarg_preload_offset 1 // ASM-NEXT: .amdhsa_user_sgpr_private_segment_size 1 // ASM-NEXT: .amdhsa_wavefront_size32 1 // ASM-NEXT: .amdhsa_enable_private_segment 1 diff --git a/llvm/test/MC/AMDGPU/hsa-gfx90a-v3.s b/llvm/test/MC/AMDGPU/hsa-gfx90a-v3.s --- a/llvm/test/MC/AMDGPU/hsa-gfx90a-v3.s +++ b/llvm/test/MC/AMDGPU/hsa-gfx90a-v3.s @@ -28,7 +28,7 @@ // OBJDUMP-NEXT: 0040 01000000 01000000 00000000 00000000 // OBJDUMP-NEXT: 0050 00000000 00000000 00000000 00000000 // OBJDUMP-NEXT: 0060 00000000 00000000 00000000 00000100 -// OBJDUMP-NEXT: 0070 c1500104 1f0f007f 7f000000 00000000 +// OBJDUMP-NEXT: 0070 c1500104 1f0f007f 7f008100 00000000 .text // ASM: .text @@ -76,6 +76,8 @@ .amdhsa_user_sgpr_kernarg_segment_ptr 1 .amdhsa_user_sgpr_dispatch_id 1 .amdhsa_user_sgpr_flat_scratch_init 1 + .amdhsa_user_sgpr_kernarg_preload_length 1 + .amdhsa_user_sgpr_kernarg_preload_offset 1 .amdhsa_user_sgpr_private_segment_size 1 .amdhsa_system_sgpr_private_segment_wavefront_offset 1 .amdhsa_system_sgpr_workgroup_id_x 0 @@ -116,6 +118,8 @@ // ASM-NEXT: .amdhsa_user_sgpr_kernarg_segment_ptr 1 // ASM-NEXT: .amdhsa_user_sgpr_dispatch_id 1 // ASM-NEXT: .amdhsa_user_sgpr_flat_scratch_init 1 +// ASM-NEXT: .amdhsa_user_sgpr_kernarg_preload_length 1 +// ASM-NEXT: .amdhsa_user_sgpr_kernarg_preload_offset 1 // ASM-NEXT: .amdhsa_user_sgpr_private_segment_size 1 // ASM-NEXT: .amdhsa_system_sgpr_private_segment_wavefront_offset 1 // ASM-NEXT: .amdhsa_system_sgpr_workgroup_id_x 0 diff --git a/llvm/test/MC/AMDGPU/hsa-gfx940-v3.s b/llvm/test/MC/AMDGPU/hsa-gfx940-v3.s --- a/llvm/test/MC/AMDGPU/hsa-gfx940-v3.s +++ b/llvm/test/MC/AMDGPU/hsa-gfx940-v3.s @@ -28,7 +28,7 @@ // OBJDUMP-NEXT: 0040 01000000 01000000 00000000 00000000 // OBJDUMP-NEXT: 0050 00000000 00000000 00000000 00000000 // OBJDUMP-NEXT: 0060 00000000 00000000 00000000 00000100 -// OBJDUMP-NEXT: 0070 01510104 130f007f 5e000000 00000000 +// OBJDUMP-NEXT: 0070 01510104 130f007f 5e008100 00000000 .text // ASM: 
.text @@ -74,6 +74,8 @@ .amdhsa_user_sgpr_queue_ptr 1 .amdhsa_user_sgpr_kernarg_segment_ptr 1 .amdhsa_user_sgpr_dispatch_id 1 + .amdhsa_user_sgpr_kernarg_preload_length 1 + .amdhsa_user_sgpr_kernarg_preload_offset 1 .amdhsa_user_sgpr_private_segment_size 1 .amdhsa_enable_private_segment 1 .amdhsa_system_sgpr_workgroup_id_x 0 @@ -111,6 +113,8 @@ // ASM-NEXT: .amdhsa_user_sgpr_queue_ptr 1 // ASM-NEXT: .amdhsa_user_sgpr_kernarg_segment_ptr 1 // ASM-NEXT: .amdhsa_user_sgpr_dispatch_id 1 +// ASM-NEXT: .amdhsa_user_sgpr_kernarg_preload_length 1 +// ASM-NEXT: .amdhsa_user_sgpr_kernarg_preload_offset 1 // ASM-NEXT: .amdhsa_user_sgpr_private_segment_size 1 // ASM-NEXT: .amdhsa_enable_private_segment 1 // ASM-NEXT: .amdhsa_system_sgpr_workgroup_id_x 0 diff --git a/llvm/test/MC/AMDGPU/hsa-v3.s b/llvm/test/MC/AMDGPU/hsa-v3.s --- a/llvm/test/MC/AMDGPU/hsa-v3.s +++ b/llvm/test/MC/AMDGPU/hsa-v3.s @@ -34,7 +34,7 @@ // OBJDUMP-NEXT: 0040 01000000 01000000 08000000 00000000 // OBJDUMP-NEXT: 0050 00000000 00000000 00000000 00000000 // OBJDUMP-NEXT: 0060 00000000 00000000 00000000 00000000 -// OBJDUMP-NEXT: 0070 c2500104 1f0f007f 7f000000 00000000 +// OBJDUMP-NEXT: 0070 c2500104 1f0f007f 7f008100 00000000 // special_sgpr // OBJDUMP-NEXT: 0080 00000000 00000000 00000000 00000000 // OBJDUMP-NEXT: 0090 00000000 00000000 00000000 00000000 @@ -100,6 +100,8 @@ .amdhsa_user_sgpr_kernarg_segment_ptr 1 .amdhsa_user_sgpr_dispatch_id 1 .amdhsa_user_sgpr_flat_scratch_init 1 + .amdhsa_user_sgpr_kernarg_preload_length 1 + .amdhsa_user_sgpr_kernarg_preload_offset 1 .amdhsa_user_sgpr_private_segment_size 1 .amdhsa_system_sgpr_private_segment_wavefront_offset 1 .amdhsa_system_sgpr_workgroup_id_x 0 @@ -139,6 +141,8 @@ // ASM-NEXT: .amdhsa_user_sgpr_kernarg_segment_ptr 1 // ASM-NEXT: .amdhsa_user_sgpr_dispatch_id 1 // ASM-NEXT: .amdhsa_user_sgpr_flat_scratch_init 1 +// ASM-NEXT: .amdhsa_user_sgpr_kernarg_preload_length 1 +// ASM-NEXT: .amdhsa_user_sgpr_kernarg_preload_offset 1 // ASM-NEXT: 
.amdhsa_user_sgpr_private_segment_size 1 // ASM-NEXT: .amdhsa_system_sgpr_private_segment_wavefront_offset 1 // ASM-NEXT: .amdhsa_system_sgpr_workgroup_id_x 0 diff --git a/llvm/test/MC/AMDGPU/hsa-v4.s b/llvm/test/MC/AMDGPU/hsa-v4.s --- a/llvm/test/MC/AMDGPU/hsa-v4.s +++ b/llvm/test/MC/AMDGPU/hsa-v4.s @@ -34,7 +34,7 @@ // OBJDUMP-NEXT: 0040 01000000 01000000 08000000 00000000 // OBJDUMP-NEXT: 0050 00000000 00000000 00000000 00000000 // OBJDUMP-NEXT: 0060 00000000 00000000 00000000 00000000 -// OBJDUMP-NEXT: 0070 c2500104 1f0f007f 7f000000 00000000 +// OBJDUMP-NEXT: 0070 c2500104 1f0f007f 7f008100 00000000 // special_sgpr // OBJDUMP-NEXT: 0080 00000000 00000000 00000000 00000000 // OBJDUMP-NEXT: 0090 00000000 00000000 00000000 00000000 @@ -101,6 +101,8 @@ .amdhsa_user_sgpr_kernarg_segment_ptr 1 .amdhsa_user_sgpr_dispatch_id 1 .amdhsa_user_sgpr_flat_scratch_init 1 + .amdhsa_user_sgpr_kernarg_preload_length 1 + .amdhsa_user_sgpr_kernarg_preload_offset 1 .amdhsa_user_sgpr_private_segment_size 1 .amdhsa_system_sgpr_private_segment_wavefront_offset 1 .amdhsa_system_sgpr_workgroup_id_x 0 @@ -140,6 +142,8 @@ // ASM-NEXT: .amdhsa_user_sgpr_kernarg_segment_ptr 1 // ASM-NEXT: .amdhsa_user_sgpr_dispatch_id 1 // ASM-NEXT: .amdhsa_user_sgpr_flat_scratch_init 1 +// ASM-NEXT: .amdhsa_user_sgpr_kernarg_preload_length 1 +// ASM-NEXT: .amdhsa_user_sgpr_kernarg_preload_offset 1 // ASM-NEXT: .amdhsa_user_sgpr_private_segment_size 1 // ASM-NEXT: .amdhsa_system_sgpr_private_segment_wavefront_offset 1 // ASM-NEXT: .amdhsa_system_sgpr_workgroup_id_x 0 diff --git a/llvm/test/MC/AMDGPU/hsa-v5-uses-dynamic-stack.s b/llvm/test/MC/AMDGPU/hsa-v5-uses-dynamic-stack.s --- a/llvm/test/MC/AMDGPU/hsa-v5-uses-dynamic-stack.s +++ b/llvm/test/MC/AMDGPU/hsa-v5-uses-dynamic-stack.s @@ -34,7 +34,7 @@ // OBJDUMP-NEXT: 0040 01000000 01000000 08000000 00000000 // OBJDUMP-NEXT: 0050 00000000 00000000 00000000 00000000 // OBJDUMP-NEXT: 0060 00000000 00000000 00000000 00000000 -// OBJDUMP-NEXT: 0070 c2500104 
1f0f007f 7f080000 00000000 +// OBJDUMP-NEXT: 0070 c2500104 1f0f007f 7f088100 00000000 // special_sgpr // OBJDUMP-NEXT: 0080 00000000 00000000 00000000 00000000 // OBJDUMP-NEXT: 0090 00000000 00000000 00000000 00000000 @@ -101,6 +101,8 @@ .amdhsa_user_sgpr_kernarg_segment_ptr 1 .amdhsa_user_sgpr_dispatch_id 1 .amdhsa_user_sgpr_flat_scratch_init 1 + .amdhsa_user_sgpr_kernarg_preload_length 1 + .amdhsa_user_sgpr_kernarg_preload_offset 1 .amdhsa_user_sgpr_private_segment_size 1 .amdhsa_uses_dynamic_stack 1 .amdhsa_system_sgpr_private_segment_wavefront_offset 1 @@ -141,6 +143,8 @@ // ASM-NEXT: .amdhsa_user_sgpr_kernarg_segment_ptr 1 // ASM-NEXT: .amdhsa_user_sgpr_dispatch_id 1 // ASM-NEXT: .amdhsa_user_sgpr_flat_scratch_init 1 +// ASM-NEXT: .amdhsa_user_sgpr_kernarg_preload_length 1 +// ASM-NEXT: .amdhsa_user_sgpr_kernarg_preload_offset 1 // ASM-NEXT: .amdhsa_user_sgpr_private_segment_size 1 // ASM-NEXT: .amdhsa_uses_dynamic_stack 1 // ASM-NEXT: .amdhsa_system_sgpr_private_segment_wavefront_offset 1 diff --git a/llvm/test/MC/AMDGPU/user-sgpr-count-diag.s b/llvm/test/MC/AMDGPU/user-sgpr-count-diag.s --- a/llvm/test/MC/AMDGPU/user-sgpr-count-diag.s +++ b/llvm/test/MC/AMDGPU/user-sgpr-count-diag.s @@ -15,3 +15,12 @@ .amdhsa_next_free_sgpr 32 // ERR: :[[@LINE+1]]:19: error: amdgpu_user_sgpr_count smaller than than implied by enabled user SGPRs .end_amdhsa_kernel + +.amdhsa_kernel implied_count_too_low_2 + .amdhsa_user_sgpr_count 16 + .amdhsa_user_sgpr_queue_ptr 1 + .amdhsa_user_sgpr_kernarg_preload_length 1 + .amdhsa_next_free_vgpr 32 + .amdhsa_next_free_sgpr 32 +// ERR: :[[@LINE+1]]:19: error: .amdhsa_user_sgpr_kernarg_preload_length + implicit user SGPR count exceeds maximum supported. 
+.end_amdhsa_kernel diff --git a/llvm/test/MC/AMDGPU/user-sgpr-count.s b/llvm/test/MC/AMDGPU/user-sgpr-count.s --- a/llvm/test/MC/AMDGPU/user-sgpr-count.s +++ b/llvm/test/MC/AMDGPU/user-sgpr-count.s @@ -85,3 +85,11 @@ .amdhsa_next_free_vgpr 32 .amdhsa_next_free_sgpr 32 .end_amdhsa_kernel + +.amdhsa_kernel preload_kernarg_0 + .amdhsa_user_sgpr_count 2 + .amdhsa_user_sgpr_queue_ptr 1 + .amdhsa_user_sgpr_kernarg_preload_length 1 + .amdhsa_next_free_vgpr 32 + .amdhsa_next_free_sgpr 32 +.end_amdhsa_kernel diff --git a/llvm/test/tools/llvm-objdump/ELF/AMDGPU/kd-gfx10.s b/llvm/test/tools/llvm-objdump/ELF/AMDGPU/kd-gfx10.s --- a/llvm/test/tools/llvm-objdump/ELF/AMDGPU/kd-gfx10.s +++ b/llvm/test/tools/llvm-objdump/ELF/AMDGPU/kd-gfx10.s @@ -52,6 +52,8 @@ ; CHECK-NEXT: .amdhsa_user_sgpr_flat_scratch_init 0 ; CHECK-NEXT: .amdhsa_user_sgpr_private_segment_size 0 ; CHECK-NEXT: .amdhsa_wavefront_size32 1 +; CHECK-NEXT: .amdhsa_user_sgpr_kernarg_preload_length 0 +; CHECK-NEXT: .amdhsa_user_sgpr_kernarg_preload_offset 0 ; CHECK-NEXT: .end_amdhsa_kernel .amdhsa_kernel kernel .amdhsa_next_free_vgpr 32 @@ -109,6 +111,8 @@ ; CHECK-NEXT: .amdhsa_user_sgpr_flat_scratch_init 0 ; CHECK-NEXT: .amdhsa_user_sgpr_private_segment_size 0 ; CHECK-NEXT: .amdhsa_wavefront_size32 0 +; CHECK-NEXT: .amdhsa_user_sgpr_kernarg_preload_length 0 +; CHECK-NEXT: .amdhsa_user_sgpr_kernarg_preload_offset 0 ; CHECK-NEXT: .end_amdhsa_kernel .amdhsa_kernel kernel .amdhsa_next_free_vgpr 32 @@ -166,6 +170,8 @@ ; CHECK-NEXT: .amdhsa_user_sgpr_flat_scratch_init 0 ; CHECK-NEXT: .amdhsa_user_sgpr_private_segment_size 0 ; CHECK-NEXT: .amdhsa_wavefront_size32 0 +; CHECK-NEXT: .amdhsa_user_sgpr_kernarg_preload_length 0 +; CHECK-NEXT: .amdhsa_user_sgpr_kernarg_preload_offset 0 ; CHECK-NEXT: .end_amdhsa_kernel .amdhsa_kernel kernel .amdhsa_next_free_vgpr 32 @@ -223,6 +229,8 @@ ; CHECK-NEXT: .amdhsa_user_sgpr_flat_scratch_init 0 ; CHECK-NEXT: .amdhsa_user_sgpr_private_segment_size 0 ; CHECK-NEXT: .amdhsa_wavefront_size32 0 
+; CHECK-NEXT: .amdhsa_user_sgpr_kernarg_preload_length 0 +; CHECK-NEXT: .amdhsa_user_sgpr_kernarg_preload_offset 0 ; CHECK-NEXT: .end_amdhsa_kernel .amdhsa_kernel kernel .amdhsa_next_free_vgpr 32 diff --git a/llvm/test/tools/llvm-objdump/ELF/AMDGPU/kd-gfx90a.s b/llvm/test/tools/llvm-objdump/ELF/AMDGPU/kd-gfx90a.s --- a/llvm/test/tools/llvm-objdump/ELF/AMDGPU/kd-gfx90a.s +++ b/llvm/test/tools/llvm-objdump/ELF/AMDGPU/kd-gfx90a.s @@ -45,6 +45,8 @@ ; CHECK-NEXT: .amdhsa_user_sgpr_dispatch_id 0 ; CHECK-NEXT: .amdhsa_user_sgpr_flat_scratch_init 0 ; CHECK-NEXT: .amdhsa_user_sgpr_private_segment_size 0 +; CHECK-NEXT: .amdhsa_user_sgpr_kernarg_preload_length 0 +; CHECK-NEXT: .amdhsa_user_sgpr_kernarg_preload_offset 0 ; CHECK-NEXT: .end_amdhsa_kernel .amdhsa_kernel kernel .amdhsa_next_free_vgpr 0 @@ -95,6 +97,8 @@ ; CHECK-NEXT: .amdhsa_user_sgpr_dispatch_id 0 ; CHECK-NEXT: .amdhsa_user_sgpr_flat_scratch_init 0 ; CHECK-NEXT: .amdhsa_user_sgpr_private_segment_size 0 +; CHECK-NEXT: .amdhsa_user_sgpr_kernarg_preload_length 0 +; CHECK-NEXT: .amdhsa_user_sgpr_kernarg_preload_offset 0 ; CHECK-NEXT: .end_amdhsa_kernel .amdhsa_kernel kernel .amdhsa_next_free_vgpr 32 @@ -145,9 +149,13 @@ ; CHECK-NEXT: .amdhsa_user_sgpr_dispatch_id 0 ; CHECK-NEXT: .amdhsa_user_sgpr_flat_scratch_init 0 ; CHECK-NEXT: .amdhsa_user_sgpr_private_segment_size 0 +; CHECK-NEXT: .amdhsa_user_sgpr_kernarg_preload_length 2 +; CHECK-NEXT: .amdhsa_user_sgpr_kernarg_preload_offset 1 ; CHECK-NEXT: .end_amdhsa_kernel .amdhsa_kernel kernel .amdhsa_next_free_vgpr 32 .amdhsa_next_free_sgpr 0 .amdhsa_accum_offset 12 + .amdhsa_user_sgpr_kernarg_preload_length 2 + .amdhsa_user_sgpr_kernarg_preload_offset 1 .end_amdhsa_kernel diff --git a/llvm/test/tools/llvm-objdump/ELF/AMDGPU/kd-sgpr.s b/llvm/test/tools/llvm-objdump/ELF/AMDGPU/kd-sgpr.s --- a/llvm/test/tools/llvm-objdump/ELF/AMDGPU/kd-sgpr.s +++ b/llvm/test/tools/llvm-objdump/ELF/AMDGPU/kd-sgpr.s @@ -44,6 +44,8 @@ ; CHECK-NEXT: .amdhsa_user_sgpr_dispatch_id 0 ; 
CHECK-NEXT: .amdhsa_user_sgpr_flat_scratch_init 0 ; CHECK-NEXT: .amdhsa_user_sgpr_private_segment_size 0 +; CHECK-NEXT: .amdhsa_user_sgpr_kernarg_preload_length 0 +; CHECK-NEXT: .amdhsa_user_sgpr_kernarg_preload_offset 0 ; CHECK-NEXT: .end_amdhsa_kernel .amdhsa_kernel kernel .amdhsa_next_free_vgpr 0 @@ -95,6 +97,8 @@ ; CHECK-NEXT: .amdhsa_user_sgpr_dispatch_id 0 ; CHECK-NEXT: .amdhsa_user_sgpr_flat_scratch_init 0 ; CHECK-NEXT: .amdhsa_user_sgpr_private_segment_size 0 +; CHECK-NEXT: .amdhsa_user_sgpr_kernarg_preload_length 0 +; CHECK-NEXT: .amdhsa_user_sgpr_kernarg_preload_offset 0 ; CHECK-NEXT: .end_amdhsa_kernel .amdhsa_kernel kernel .amdhsa_next_free_vgpr 0 @@ -146,6 +150,8 @@ ; CHECK-NEXT: .amdhsa_user_sgpr_dispatch_id 0 ; CHECK-NEXT: .amdhsa_user_sgpr_flat_scratch_init 0 ; CHECK-NEXT: .amdhsa_user_sgpr_private_segment_size 0 +; CHECK-NEXT: .amdhsa_user_sgpr_kernarg_preload_length 2 +; CHECK-NEXT: .amdhsa_user_sgpr_kernarg_preload_offset 1 ; CHECK-NEXT: .end_amdhsa_kernel .amdhsa_kernel kernel .amdhsa_next_free_vgpr 0 @@ -153,4 +159,6 @@ .amdhsa_reserve_flat_scratch 1 .amdhsa_reserve_xnack_mask 0 .amdhsa_reserve_vcc 1 + .amdhsa_user_sgpr_kernarg_preload_length 2 + .amdhsa_user_sgpr_kernarg_preload_offset 1 .end_amdhsa_kernel diff --git a/llvm/test/tools/llvm-objdump/ELF/AMDGPU/kd-vgpr.s b/llvm/test/tools/llvm-objdump/ELF/AMDGPU/kd-vgpr.s --- a/llvm/test/tools/llvm-objdump/ELF/AMDGPU/kd-vgpr.s +++ b/llvm/test/tools/llvm-objdump/ELF/AMDGPU/kd-vgpr.s @@ -43,6 +43,8 @@ ; CHECK-NEXT: .amdhsa_user_sgpr_dispatch_id 0 ; CHECK-NEXT: .amdhsa_user_sgpr_flat_scratch_init 0 ; CHECK-NEXT: .amdhsa_user_sgpr_private_segment_size 0 +; CHECK-NEXT: .amdhsa_user_sgpr_kernarg_preload_length 0 +; CHECK-NEXT: .amdhsa_user_sgpr_kernarg_preload_offset 0 ; CHECK-NEXT: .end_amdhsa_kernel .amdhsa_kernel kernel .amdhsa_next_free_vgpr 23 @@ -90,6 +92,8 @@ ; CHECK-NEXT: .amdhsa_user_sgpr_dispatch_id 0 ; CHECK-NEXT: .amdhsa_user_sgpr_flat_scratch_init 0 ; CHECK-NEXT: 
.amdhsa_user_sgpr_private_segment_size 0 +; CHECK-NEXT: .amdhsa_user_sgpr_kernarg_preload_length 0 +; CHECK-NEXT: .amdhsa_user_sgpr_kernarg_preload_offset 0 ; CHECK-NEXT: .end_amdhsa_kernel .amdhsa_kernel kernel .amdhsa_next_free_vgpr 14 @@ -137,6 +141,8 @@ ; CHECK-NEXT: .amdhsa_user_sgpr_dispatch_id 0 ; CHECK-NEXT: .amdhsa_user_sgpr_flat_scratch_init 0 ; CHECK-NEXT: .amdhsa_user_sgpr_private_segment_size 0 +; CHECK-NEXT: .amdhsa_user_sgpr_kernarg_preload_length 0 +; CHECK-NEXT: .amdhsa_user_sgpr_kernarg_preload_offset 0 ; CHECK-NEXT: .end_amdhsa_kernel .amdhsa_kernel kernel .amdhsa_next_free_vgpr 32 diff --git a/llvm/test/tools/llvm-objdump/ELF/AMDGPU/kd-zeroed-gfx10.s b/llvm/test/tools/llvm-objdump/ELF/AMDGPU/kd-zeroed-gfx10.s --- a/llvm/test/tools/llvm-objdump/ELF/AMDGPU/kd-zeroed-gfx10.s +++ b/llvm/test/tools/llvm-objdump/ELF/AMDGPU/kd-zeroed-gfx10.s @@ -66,6 +66,8 @@ ; OBJDUMP-NEXT: .amdhsa_user_sgpr_flat_scratch_init 0 ; OBJDUMP-NEXT: .amdhsa_user_sgpr_private_segment_size 0 ; OBJDUMP-NEXT: .amdhsa_wavefront_size32 0 +; OBJDUMP-NEXT: .amdhsa_user_sgpr_kernarg_preload_length 0 +; OBJDUMP-NEXT: .amdhsa_user_sgpr_kernarg_preload_offset 0 ; OBJDUMP-NEXT: .end_amdhsa_kernel .amdhsa_kernel my_kernel