diff --git a/clang/test/Frontend/amdgcn-machine-analysis-remarks.cl b/clang/test/Frontend/amdgcn-machine-analysis-remarks.cl --- a/clang/test/Frontend/amdgcn-machine-analysis-remarks.cl +++ b/clang/test/Frontend/amdgcn-machine-analysis-remarks.cl @@ -2,7 +2,7 @@ // RUN: %clang_cc1 -triple amdgcn-amd-amdhsa -target-cpu gfx908 -Rpass-analysis=kernel-resource-usage -S -O0 -verify %s -o /dev/null // expected-remark@+9 {{Function Name: foo}} -// expected-remark@+8 {{ SGPRs: 9}} +// expected-remark@+8 {{ SGPRs: 13}} // expected-remark@+7 {{ VGPRs: 10}} // expected-remark@+6 {{ AGPRs: 12}} // expected-remark@+5 {{ ScratchSize [bytes/lane]: 0}} diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp --- a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp @@ -251,9 +251,9 @@ STM, KernelName, getAmdhsaKernelDescriptor(*MF, CurrentProgramInfo), CurrentProgramInfo.NumVGPRsForWavesPerEU, CurrentProgramInfo.NumSGPRsForWavesPerEU - - IsaInfo::getNumExtraSGPRs(&STM, - CurrentProgramInfo.VCCUsed, - CurrentProgramInfo.FlatUsed), + IsaInfo::getNumExtraSGPRs( + &STM, CurrentProgramInfo.VCCUsed, CurrentProgramInfo.FlatUsed, + getTargetStreamer()->getTargetID()->isXnackOnOrAny()), CurrentProgramInfo.VCCUsed, CurrentProgramInfo.FlatUsed, CodeObjectVersion); @@ -721,7 +721,8 @@ // duplicated in part in AMDGPUAsmParser::calculateGPRBlocks, and could be // unified. unsigned ExtraSGPRs = IsaInfo::getNumExtraSGPRs( - &STM, ProgInfo.VCCUsed, ProgInfo.FlatUsed); + &STM, ProgInfo.VCCUsed, ProgInfo.FlatUsed, + getTargetStreamer()->getTargetID()->isXnackOnOrAny()); // Check the addressable register limit before we add ExtraSGPRs. if (STM.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS && diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.ll @@ -3061,7 +3061,7 @@ ; GPRIDX-NEXT: gds_segment_byte_size = 0 ; GPRIDX-NEXT: kernarg_segment_byte_size = 12 ; GPRIDX-NEXT: workgroup_fbarrier_count = 0 -; GPRIDX-NEXT: wavefront_sgpr_count = 9 +; GPRIDX-NEXT: wavefront_sgpr_count = 13 ; GPRIDX-NEXT: workitem_vgpr_count = 3 ; GPRIDX-NEXT: reserved_vgpr_first = 0 ; GPRIDX-NEXT: reserved_vgpr_count = 0 @@ -3913,7 +3913,7 @@ ; GPRIDX-NEXT: kernel_code_entry_byte_offset = 256 ; GPRIDX-NEXT: kernel_code_prefetch_byte_size = 0 ; GPRIDX-NEXT: granulated_workitem_vgpr_count = 0 -; GPRIDX-NEXT: granulated_wavefront_sgpr_count = 0 +; GPRIDX-NEXT: granulated_wavefront_sgpr_count = 1 ; GPRIDX-NEXT: priority = 0 ; GPRIDX-NEXT: float_mode = 240 ; GPRIDX-NEXT: priv = 0 @@ -3956,7 +3956,7 @@ ; GPRIDX-NEXT: gds_segment_byte_size = 0 ; GPRIDX-NEXT: kernarg_segment_byte_size = 12 ; GPRIDX-NEXT: workgroup_fbarrier_count = 0 -; GPRIDX-NEXT: wavefront_sgpr_count = 6 +; GPRIDX-NEXT: wavefront_sgpr_count = 10 ; GPRIDX-NEXT: workitem_vgpr_count = 2 ; GPRIDX-NEXT: reserved_vgpr_first = 0 ; GPRIDX-NEXT: reserved_vgpr_count = 0 @@ -4259,7 +4259,7 @@ ; GPRIDX-NEXT: kernel_code_entry_byte_offset = 256 ; GPRIDX-NEXT: kernel_code_prefetch_byte_size = 0 ; GPRIDX-NEXT: granulated_workitem_vgpr_count = 0 -; GPRIDX-NEXT: granulated_wavefront_sgpr_count = 0 +; GPRIDX-NEXT: granulated_wavefront_sgpr_count = 1 ; GPRIDX-NEXT: priority = 0 ; GPRIDX-NEXT: float_mode = 240 ; GPRIDX-NEXT: priv = 0 @@ -4302,7 +4302,7 @@ ; GPRIDX-NEXT: gds_segment_byte_size = 0 ; GPRIDX-NEXT: kernarg_segment_byte_size = 12 ; GPRIDX-NEXT: workgroup_fbarrier_count = 0 -; GPRIDX-NEXT: wavefront_sgpr_count = 7 +; GPRIDX-NEXT: wavefront_sgpr_count = 11 ; GPRIDX-NEXT: workitem_vgpr_count = 3 ; GPRIDX-NEXT: reserved_vgpr_first = 0 ; GPRIDX-NEXT: reserved_vgpr_count = 0 diff --git a/llvm/test/CodeGen/AMDGPU/amdpal-callable.ll b/llvm/test/CodeGen/AMDGPU/amdpal-callable.ll --- a/llvm/test/CodeGen/AMDGPU/amdpal-callable.ll +++ b/llvm/test/CodeGen/AMDGPU/amdpal-callable.ll @@ -1,6 +1,6 @@ -; RUN: llc -mtriple=amdgcn--amdpal -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,SDAG,GFX8 -enable-var-scope %s -; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,SDAG,GFX9 -enable-var-scope %s -; RUN: llc -global-isel -mtriple=amdgcn--amdpal -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GISEL,GFX9 -enable-var-scope %s +; RUN: llc -mtriple=amdgcn--amdpal -mattr=-xnack -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,SDAG,GFX8 -enable-var-scope %s +; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx900 -mattr=-xnack -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,SDAG,GFX9 -enable-var-scope %s +; RUN: llc -global-isel -mtriple=amdgcn--amdpal -mattr=-xnack -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GISEL,GFX9 -enable-var-scope %s declare amdgpu_gfx float @extern_func(float) #0 declare amdgpu_gfx float @extern_func_many_args(<64 x float>) #0 diff --git a/llvm/test/CodeGen/AMDGPU/occupancy-levels.ll b/llvm/test/CodeGen/AMDGPU/occupancy-levels.ll --- a/llvm/test/CodeGen/AMDGPU/occupancy-levels.ll +++ b/llvm/test/CodeGen/AMDGPU/occupancy-levels.ll @@ -1,6 +1,6 @@ -; RUN: llc -march=amdgcn -mcpu=gfx900 < %s | FileCheck --check-prefixes=GCN,GFX9 %s -; RUN: llc -march=amdgcn -mcpu=gfx1010 < %s | FileCheck --check-prefixes=GCN,GFX10,GFX10W32,GFX1010,GFX1010W32 %s -; RUN: llc -march=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize64 < %s | FileCheck --check-prefixes=GCN,GFX10,GFX10W64,GFX1010,GFX1010W64 %s +; RUN: llc -march=amdgcn -mcpu=gfx900 -mattr=-xnack < %s | FileCheck --check-prefixes=GCN,GFX9 %s +; RUN: llc -march=amdgcn -mcpu=gfx1010 -mattr=-xnack < %s | FileCheck --check-prefixes=GCN,GFX10,GFX10W32,GFX1010,GFX1010W32 %s +; RUN: llc -march=amdgcn -mcpu=gfx1010 -mattr=-xnack -mattr=+wavefrontsize64 < %s | FileCheck --check-prefixes=GCN,GFX10,GFX10W64,GFX1010,GFX1010W64 %s ; RUN: llc -march=amdgcn -mcpu=gfx1030 < %s | FileCheck --check-prefixes=GCN,GFX10,GFX10W32,GFX1030,GFX1030W32 %s ; RUN: llc -march=amdgcn -mcpu=gfx1030 -mattr=+wavefrontsize64 < %s | FileCheck --check-prefixes=GCN,GFX10,GFX10W64,GFX1030,GFX1030W64 %s ; RUN: llc -march=amdgcn -mcpu=gfx1100 < %s | FileCheck --check-prefixes=GCN,GFX1100,GFX1100W32 %s diff --git a/llvm/test/CodeGen/AMDGPU/resource-optimization-remarks.ll b/llvm/test/CodeGen/AMDGPU/resource-optimization-remarks.ll --- a/llvm/test/CodeGen/AMDGPU/resource-optimization-remarks.ll +++ b/llvm/test/CodeGen/AMDGPU/resource-optimization-remarks.ll @@ -2,7 +2,7 @@ ; RUN: FileCheck -check-prefix=REMARK %s < %t ; STDERR: remark: foo.cl:27:0: Function Name: test_kernel -; STDERR-NEXT: remark: foo.cl:27:0: SGPRs: 24 +; STDERR-NEXT: remark: foo.cl:27:0: SGPRs: 28 ; STDERR-NEXT: remark: foo.cl:27:0: VGPRs: 9 ; STDERR-NEXT: remark: foo.cl:27:0: AGPRs: 43 ; STDERR-NEXT: remark: foo.cl:27:0: ScratchSize [bytes/lane]: 0 @@ -27,7 +27,7 @@ ; REMARK-NEXT: Function: test_kernel ; REMARK-NEXT: Args: ; REMARK-NEXT: - String: ' SGPRs: ' -; REMARK-NEXT: - NumSGPR: '24' +; REMARK-NEXT: - NumSGPR: '28' ; REMARK-NEXT: ... ; REMARK-NEXT: --- !Analysis ; REMARK-NEXT: Pass: kernel-resource-usage @@ -120,7 +120,7 @@ } ; STDERR: remark: foo.cl:8:0: Function Name: empty_kernel -; STDERR-NEXT: remark: foo.cl:8:0: SGPRs: 0 +; STDERR-NEXT: remark: foo.cl:8:0: SGPRs: 4 ; STDERR-NEXT: remark: foo.cl:8:0: VGPRs: 0 ; STDERR-NEXT: remark: foo.cl:8:0: AGPRs: 0 ; STDERR-NEXT: remark: foo.cl:8:0: ScratchSize [bytes/lane]: 0 diff --git a/llvm/test/CodeGen/AMDGPU/tid-kd-xnack-any.ll b/llvm/test/CodeGen/AMDGPU/tid-kd-xnack-any.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/tid-kd-xnack-any.ll @@ -0,0 +1,27 @@ +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a < %s | FileCheck --check-prefixes=ASM %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a --filetype=obj < %s | llvm-objdump -s -j .rodata - | FileCheck --check-prefixes=OBJ %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a --filetype=obj < %s | llvm-readelf --notes - | FileCheck --check-prefixes=ELF %s + +; TODO: Update to check for granulated sgpr count directive once one is added. + +define amdgpu_kernel void @kern() { +; ASM-LABEL: kern: +; ASM: .amdhsa_next_free_sgpr 5 +; ASM: .amdhsa_reserve_xnack_mask 1 + +; Verify that an extra SGPR block is reserved with XNACK "any" tid setting. +; OBJ: Contents of section .rodata: +; OBJ-NEXT: 0000 00000000 00000000 00000000 00000000 ................ +; OBJ-NEXT: 0010 00000000 00000000 00000000 00000000 ................ +; OBJ-NEXT: 0020 00000000 00000000 00000000 00000000 ................ +; OBJ-NEXT: 0030 4000af00 88000000 01000000 00000000 @............... + +; ELF: AMDGPU Metadata +; ELF: .sgpr_count: 9 +entry: + tail call void asm sideeffect "", "~{s[0:4]}"() + ret void +} + +!llvm.module.flags = !{!0} +!0 = !{i32 1, !"amdgpu_code_object_version", i32 400} diff --git a/llvm/test/CodeGen/AMDGPU/tid-kd-xnack-off.ll b/llvm/test/CodeGen/AMDGPU/tid-kd-xnack-off.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/tid-kd-xnack-off.ll @@ -0,0 +1,27 @@ +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -mattr=-xnack < %s | FileCheck --check-prefixes=ASM %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -mattr=-xnack --filetype=obj < %s | llvm-objdump -s -j .rodata - | FileCheck --check-prefixes=OBJ %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -mattr=-xnack --filetype=obj < %s | llvm-readelf --notes - | FileCheck --check-prefixes=ELF %s + +; TODO: Update to check for granulated sgpr count directive once one is added. + +define amdgpu_kernel void @kern() { +; ASM-LABEL: kern: +; ASM: .amdhsa_next_free_sgpr 5 +; ASM: .amdhsa_reserve_xnack_mask 0 + +; Verify that an extra SGPR block is not reserved with XNACK "off" tid setting. +; OBJ: Contents of section .rodata: +; OBJ-NEXT: 0000 00000000 00000000 00000000 00000000 ................ +; OBJ-NEXT: 0010 00000000 00000000 00000000 00000000 ................ +; OBJ-NEXT: 0020 00000000 00000000 00000000 00000000 ................ +; OBJ-NEXT: 0030 0000af00 88000000 01000000 00000000 ................ + +; ELF: AMDGPU Metadata +; ELF: .sgpr_count: 5 +entry: + tail call void asm sideeffect "", "~{s[0:4]}"() + ret void +} + +!llvm.module.flags = !{!0} +!0 = !{i32 1, !"amdgpu_code_object_version", i32 400} diff --git a/llvm/test/CodeGen/AMDGPU/tid-kd-xnack-on.ll b/llvm/test/CodeGen/AMDGPU/tid-kd-xnack-on.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/tid-kd-xnack-on.ll @@ -0,0 +1,27 @@ +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -mattr=+xnack < %s | FileCheck --check-prefixes=ASM %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -mattr=+xnack --filetype=obj < %s | llvm-objdump -s -j .rodata - | FileCheck --check-prefixes=OBJ %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -mattr=+xnack --filetype=obj < %s | llvm-readelf --notes - | FileCheck --check-prefixes=ELF %s + +; TODO: Update to check for granulated sgpr count directive once one is added. + +define amdgpu_kernel void @kern() { +; ASM-LABEL: kern: +; ASM: .amdhsa_next_free_sgpr 5 +; ASM: .amdhsa_reserve_xnack_mask 1 + +; Verify that an extra SGPR block is reserved with XNACK "on" tid setting. +; OBJ: Contents of section .rodata: +; OBJ-NEXT: 0000 00000000 00000000 00000000 00000000 ................ +; OBJ-NEXT: 0010 00000000 00000000 00000000 00000000 ................ +; OBJ-NEXT: 0020 00000000 00000000 00000000 00000000 ................ +; OBJ-NEXT: 0030 4000af00 88000000 01000000 00000000 @............... + +; ELF: AMDGPU Metadata +; ELF: .sgpr_count: 9 +entry: + tail call void asm sideeffect "", "~{s[0:4]}"() + ret void +} + +!llvm.module.flags = !{!0} +!0 = !{i32 1, !"amdgpu_code_object_version", i32 400} diff --git a/llvm/test/CodeGen/AMDGPU/trap-abis.ll b/llvm/test/CodeGen/AMDGPU/trap-abis.ll --- a/llvm/test/CodeGen/AMDGPU/trap-abis.ll +++ b/llvm/test/CodeGen/AMDGPU/trap-abis.ll @@ -17,7 +17,75 @@ define amdgpu_kernel void @trap(ptr addrspace(1) nocapture readonly %arg0) { ; NOHSA-TRAP-GFX900-V2-LABEL: trap: -; NOHSA-TRAP-GFX900-V2: ; %bb.0: +; NOHSA-TRAP-GFX900-V2: .amd_kernel_code_t +; NOHSA-TRAP-GFX900-V2-NEXT: amd_code_version_major = 1 +; NOHSA-TRAP-GFX900-V2-NEXT: amd_code_version_minor = 2 +; NOHSA-TRAP-GFX900-V2-NEXT: amd_machine_kind = 1 +; NOHSA-TRAP-GFX900-V2-NEXT: amd_machine_version_major = 9 +; NOHSA-TRAP-GFX900-V2-NEXT: amd_machine_version_minor = 0 +; NOHSA-TRAP-GFX900-V2-NEXT: amd_machine_version_stepping = 0 +; NOHSA-TRAP-GFX900-V2-NEXT: kernel_code_entry_byte_offset = 256 +; NOHSA-TRAP-GFX900-V2-NEXT: kernel_code_prefetch_byte_size = 0 +; NOHSA-TRAP-GFX900-V2-NEXT: granulated_workitem_vgpr_count = 0 +; NOHSA-TRAP-GFX900-V2-NEXT: granulated_wavefront_sgpr_count = 0 +; NOHSA-TRAP-GFX900-V2-NEXT: priority = 0 +; NOHSA-TRAP-GFX900-V2-NEXT: float_mode = 240 +; NOHSA-TRAP-GFX900-V2-NEXT: priv = 0 +; NOHSA-TRAP-GFX900-V2-NEXT: enable_dx10_clamp = 1 +; NOHSA-TRAP-GFX900-V2-NEXT: debug_mode = 0 +; NOHSA-TRAP-GFX900-V2-NEXT: enable_ieee_mode = 1 +; NOHSA-TRAP-GFX900-V2-NEXT: enable_wgp_mode = 0 +; NOHSA-TRAP-GFX900-V2-NEXT: enable_mem_ordered = 0 +; NOHSA-TRAP-GFX900-V2-NEXT: enable_fwd_progress = 0 +; NOHSA-TRAP-GFX900-V2-NEXT: enable_sgpr_private_segment_wave_byte_offset = 0 +; NOHSA-TRAP-GFX900-V2-NEXT: user_sgpr_count = 4 +; NOHSA-TRAP-GFX900-V2-NEXT: enable_trap_handler = 0 +; NOHSA-TRAP-GFX900-V2-NEXT: enable_sgpr_workgroup_id_x = 1 +; NOHSA-TRAP-GFX900-V2-NEXT: enable_sgpr_workgroup_id_y = 0 +; NOHSA-TRAP-GFX900-V2-NEXT: enable_sgpr_workgroup_id_z = 0 +; NOHSA-TRAP-GFX900-V2-NEXT: enable_sgpr_workgroup_info = 0 +; NOHSA-TRAP-GFX900-V2-NEXT: enable_vgpr_workitem_id = 0 +; NOHSA-TRAP-GFX900-V2-NEXT: enable_exception_msb = 0 +; NOHSA-TRAP-GFX900-V2-NEXT: granulated_lds_size = 0 +; NOHSA-TRAP-GFX900-V2-NEXT: enable_exception = 0 +; NOHSA-TRAP-GFX900-V2-NEXT: enable_sgpr_private_segment_buffer = 0 +; NOHSA-TRAP-GFX900-V2-NEXT: enable_sgpr_dispatch_ptr = 0 +; NOHSA-TRAP-GFX900-V2-NEXT: enable_sgpr_queue_ptr = 1 +; NOHSA-TRAP-GFX900-V2-NEXT: enable_sgpr_kernarg_segment_ptr = 1 +; NOHSA-TRAP-GFX900-V2-NEXT: enable_sgpr_dispatch_id = 0 +; NOHSA-TRAP-GFX900-V2-NEXT: enable_sgpr_flat_scratch_init = 0 +; NOHSA-TRAP-GFX900-V2-NEXT: enable_sgpr_private_segment_size = 0 +; NOHSA-TRAP-GFX900-V2-NEXT: enable_sgpr_grid_workgroup_count_x = 0 +; NOHSA-TRAP-GFX900-V2-NEXT: enable_sgpr_grid_workgroup_count_y = 0 +; NOHSA-TRAP-GFX900-V2-NEXT: enable_sgpr_grid_workgroup_count_z = 0 +; NOHSA-TRAP-GFX900-V2-NEXT: enable_wavefront_size32 = 0 +; NOHSA-TRAP-GFX900-V2-NEXT: enable_ordered_append_gds = 0 +; NOHSA-TRAP-GFX900-V2-NEXT: private_element_size = 1 +; NOHSA-TRAP-GFX900-V2-NEXT: is_ptr64 = 1 +; NOHSA-TRAP-GFX900-V2-NEXT: is_dynamic_callstack = 0 +; NOHSA-TRAP-GFX900-V2-NEXT: is_debug_enabled = 0 +; NOHSA-TRAP-GFX900-V2-NEXT: is_xnack_enabled = 1 +; NOHSA-TRAP-GFX900-V2-NEXT: workitem_private_segment_byte_size = 0 +; NOHSA-TRAP-GFX900-V2-NEXT: workgroup_group_segment_byte_size = 0 +; NOHSA-TRAP-GFX900-V2-NEXT: gds_segment_byte_size = 0 +; NOHSA-TRAP-GFX900-V2-NEXT: kernarg_segment_byte_size = 44 +; NOHSA-TRAP-GFX900-V2-NEXT: workgroup_fbarrier_count = 0 +; NOHSA-TRAP-GFX900-V2-NEXT: wavefront_sgpr_count = 8 +; NOHSA-TRAP-GFX900-V2-NEXT: workitem_vgpr_count = 2 +; NOHSA-TRAP-GFX900-V2-NEXT: reserved_vgpr_first = 0 +; NOHSA-TRAP-GFX900-V2-NEXT: reserved_vgpr_count = 0 +; NOHSA-TRAP-GFX900-V2-NEXT: reserved_sgpr_first = 0 +; NOHSA-TRAP-GFX900-V2-NEXT: reserved_sgpr_count = 0 +; NOHSA-TRAP-GFX900-V2-NEXT: debug_wavefront_private_segment_offset_sgpr = 0 +; NOHSA-TRAP-GFX900-V2-NEXT: debug_private_segment_buffer_sgpr = 0 +; NOHSA-TRAP-GFX900-V2-NEXT: kernarg_segment_alignment = 4 +; NOHSA-TRAP-GFX900-V2-NEXT: group_segment_alignment = 4 +; NOHSA-TRAP-GFX900-V2-NEXT: private_segment_alignment = 4 +; NOHSA-TRAP-GFX900-V2-NEXT: wavefront_size = 6 +; NOHSA-TRAP-GFX900-V2-NEXT: call_convention = -1 +; NOHSA-TRAP-GFX900-V2-NEXT: runtime_loader_kernel_symbol = 0 +; NOHSA-TRAP-GFX900-V2-NEXT: .end_amd_kernel_code_t +; NOHSA-TRAP-GFX900-V2-NEXT: ; %bb.0: ; NOHSA-TRAP-GFX900-V2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; NOHSA-TRAP-GFX900-V2-NEXT: v_mov_b32_e32 v0, 0 ; NOHSA-TRAP-GFX900-V2-NEXT: v_mov_b32_e32 v1, 1 @@ -161,7 +229,7 @@ ; HSA-TRAP-GFX900-V2-NEXT: kernel_code_entry_byte_offset = 256 ; HSA-TRAP-GFX900-V2-NEXT: kernel_code_prefetch_byte_size = 0 ; HSA-TRAP-GFX900-V2-NEXT: granulated_workitem_vgpr_count = 0 -; HSA-TRAP-GFX900-V2-NEXT: granulated_wavefront_sgpr_count = 0 +; HSA-TRAP-GFX900-V2-NEXT: granulated_wavefront_sgpr_count = 1 ; HSA-TRAP-GFX900-V2-NEXT: priority = 0 ; HSA-TRAP-GFX900-V2-NEXT: float_mode = 240 ; HSA-TRAP-GFX900-V2-NEXT: priv = 0 @@ -204,7 +272,7 @@ ; HSA-TRAP-GFX900-V2-NEXT: gds_segment_byte_size = 0 ; HSA-TRAP-GFX900-V2-NEXT: kernarg_segment_byte_size = 8 ; HSA-TRAP-GFX900-V2-NEXT: workgroup_fbarrier_count = 0 -; HSA-TRAP-GFX900-V2-NEXT: wavefront_sgpr_count = 8 +; HSA-TRAP-GFX900-V2-NEXT: wavefront_sgpr_count = 12 ; HSA-TRAP-GFX900-V2-NEXT: workitem_vgpr_count = 2 ; HSA-TRAP-GFX900-V2-NEXT: reserved_vgpr_first = 0 ; HSA-TRAP-GFX900-V2-NEXT: reserved_vgpr_count = 0 @@ -261,7 +329,7 @@ ; HSA-NOTRAP-GFX900-V2-NEXT: kernel_code_entry_byte_offset = 256 ; HSA-NOTRAP-GFX900-V2-NEXT: kernel_code_prefetch_byte_size = 0 ; HSA-NOTRAP-GFX900-V2-NEXT: granulated_workitem_vgpr_count = 0 -; HSA-NOTRAP-GFX900-V2-NEXT: granulated_wavefront_sgpr_count = 0 +; HSA-NOTRAP-GFX900-V2-NEXT: granulated_wavefront_sgpr_count = 1 ; HSA-NOTRAP-GFX900-V2-NEXT: priority = 0 ; HSA-NOTRAP-GFX900-V2-NEXT: float_mode = 240 ; HSA-NOTRAP-GFX900-V2-NEXT: priv = 0 @@ -304,7 +372,7 @@ ; HSA-NOTRAP-GFX900-V2-NEXT: gds_segment_byte_size = 0 ; HSA-NOTRAP-GFX900-V2-NEXT: kernarg_segment_byte_size = 8 ; HSA-NOTRAP-GFX900-V2-NEXT: workgroup_fbarrier_count = 0 -; HSA-NOTRAP-GFX900-V2-NEXT: wavefront_sgpr_count = 8 +; HSA-NOTRAP-GFX900-V2-NEXT: wavefront_sgpr_count = 12 ; HSA-NOTRAP-GFX900-V2-NEXT: workitem_vgpr_count = 2 ; HSA-NOTRAP-GFX900-V2-NEXT: reserved_vgpr_first = 0 ; HSA-NOTRAP-GFX900-V2-NEXT: reserved_vgpr_count = 0 @@ -356,7 +424,75 @@ define amdgpu_kernel void @non_entry_trap(ptr addrspace(1) nocapture readonly %arg0) local_unnamed_addr { ; NOHSA-TRAP-GFX900-V2-LABEL: non_entry_trap: -; NOHSA-TRAP-GFX900-V2: ; %bb.0: ; %entry +; NOHSA-TRAP-GFX900-V2: .amd_kernel_code_t +; NOHSA-TRAP-GFX900-V2-NEXT: amd_code_version_major = 1 +; NOHSA-TRAP-GFX900-V2-NEXT: amd_code_version_minor = 2 +; NOHSA-TRAP-GFX900-V2-NEXT: amd_machine_kind = 1 +; NOHSA-TRAP-GFX900-V2-NEXT: amd_machine_version_major = 9 +; NOHSA-TRAP-GFX900-V2-NEXT: amd_machine_version_minor = 0 +; NOHSA-TRAP-GFX900-V2-NEXT: amd_machine_version_stepping = 0 +; NOHSA-TRAP-GFX900-V2-NEXT: kernel_code_entry_byte_offset = 256 +; NOHSA-TRAP-GFX900-V2-NEXT: kernel_code_prefetch_byte_size = 0 +; NOHSA-TRAP-GFX900-V2-NEXT: granulated_workitem_vgpr_count = 0 +; NOHSA-TRAP-GFX900-V2-NEXT: granulated_wavefront_sgpr_count = 0 +; NOHSA-TRAP-GFX900-V2-NEXT: priority = 0 +; NOHSA-TRAP-GFX900-V2-NEXT: float_mode = 240 +; NOHSA-TRAP-GFX900-V2-NEXT: priv = 0 +; NOHSA-TRAP-GFX900-V2-NEXT: enable_dx10_clamp = 1 +; NOHSA-TRAP-GFX900-V2-NEXT: debug_mode = 0 +; NOHSA-TRAP-GFX900-V2-NEXT: enable_ieee_mode = 1 +; NOHSA-TRAP-GFX900-V2-NEXT: enable_wgp_mode = 0 +; NOHSA-TRAP-GFX900-V2-NEXT: enable_mem_ordered = 0 +; NOHSA-TRAP-GFX900-V2-NEXT: enable_fwd_progress = 0 +; NOHSA-TRAP-GFX900-V2-NEXT: enable_sgpr_private_segment_wave_byte_offset = 0 +; NOHSA-TRAP-GFX900-V2-NEXT: user_sgpr_count = 4 +; NOHSA-TRAP-GFX900-V2-NEXT: enable_trap_handler = 0 +; NOHSA-TRAP-GFX900-V2-NEXT: enable_sgpr_workgroup_id_x = 1 +; NOHSA-TRAP-GFX900-V2-NEXT: enable_sgpr_workgroup_id_y = 0 +; NOHSA-TRAP-GFX900-V2-NEXT: enable_sgpr_workgroup_id_z = 0 +; NOHSA-TRAP-GFX900-V2-NEXT: enable_sgpr_workgroup_info = 0 +; NOHSA-TRAP-GFX900-V2-NEXT: enable_vgpr_workitem_id = 0 +; NOHSA-TRAP-GFX900-V2-NEXT: enable_exception_msb = 0 +; NOHSA-TRAP-GFX900-V2-NEXT: granulated_lds_size = 0 +; NOHSA-TRAP-GFX900-V2-NEXT: enable_exception = 0 +; NOHSA-TRAP-GFX900-V2-NEXT: enable_sgpr_private_segment_buffer = 0 +; NOHSA-TRAP-GFX900-V2-NEXT: enable_sgpr_dispatch_ptr = 0 +; NOHSA-TRAP-GFX900-V2-NEXT: enable_sgpr_queue_ptr = 1 +; NOHSA-TRAP-GFX900-V2-NEXT: enable_sgpr_kernarg_segment_ptr = 1 +; NOHSA-TRAP-GFX900-V2-NEXT: enable_sgpr_dispatch_id = 0 +; NOHSA-TRAP-GFX900-V2-NEXT: enable_sgpr_flat_scratch_init = 0 +; NOHSA-TRAP-GFX900-V2-NEXT: enable_sgpr_private_segment_size = 0 +; NOHSA-TRAP-GFX900-V2-NEXT: enable_sgpr_grid_workgroup_count_x = 0 +; NOHSA-TRAP-GFX900-V2-NEXT: enable_sgpr_grid_workgroup_count_y = 0 +; NOHSA-TRAP-GFX900-V2-NEXT: enable_sgpr_grid_workgroup_count_z = 0 +; NOHSA-TRAP-GFX900-V2-NEXT: enable_wavefront_size32 = 0 +; NOHSA-TRAP-GFX900-V2-NEXT: enable_ordered_append_gds = 0 +; NOHSA-TRAP-GFX900-V2-NEXT: private_element_size = 1 +; NOHSA-TRAP-GFX900-V2-NEXT: is_ptr64 = 1 +; NOHSA-TRAP-GFX900-V2-NEXT: is_dynamic_callstack = 0 +; NOHSA-TRAP-GFX900-V2-NEXT: is_debug_enabled = 0 +; NOHSA-TRAP-GFX900-V2-NEXT: is_xnack_enabled = 1 +; NOHSA-TRAP-GFX900-V2-NEXT: workitem_private_segment_byte_size = 0 +; NOHSA-TRAP-GFX900-V2-NEXT: workgroup_group_segment_byte_size = 0 +; NOHSA-TRAP-GFX900-V2-NEXT: gds_segment_byte_size = 0 +; NOHSA-TRAP-GFX900-V2-NEXT: kernarg_segment_byte_size = 44 +; NOHSA-TRAP-GFX900-V2-NEXT: workgroup_fbarrier_count = 0 +; NOHSA-TRAP-GFX900-V2-NEXT: wavefront_sgpr_count = 8 +; NOHSA-TRAP-GFX900-V2-NEXT: workitem_vgpr_count = 2 +; NOHSA-TRAP-GFX900-V2-NEXT: reserved_vgpr_first = 0 +; NOHSA-TRAP-GFX900-V2-NEXT: reserved_vgpr_count = 0 +; NOHSA-TRAP-GFX900-V2-NEXT: reserved_sgpr_first = 0 +; NOHSA-TRAP-GFX900-V2-NEXT: reserved_sgpr_count = 0 +; NOHSA-TRAP-GFX900-V2-NEXT: debug_wavefront_private_segment_offset_sgpr = 0 +; NOHSA-TRAP-GFX900-V2-NEXT: debug_private_segment_buffer_sgpr = 0 +; NOHSA-TRAP-GFX900-V2-NEXT: kernarg_segment_alignment = 4 +; NOHSA-TRAP-GFX900-V2-NEXT: group_segment_alignment = 4 +; NOHSA-TRAP-GFX900-V2-NEXT: private_segment_alignment = 4 +; NOHSA-TRAP-GFX900-V2-NEXT: wavefront_size = 6 +; NOHSA-TRAP-GFX900-V2-NEXT: call_convention = -1 +; NOHSA-TRAP-GFX900-V2-NEXT: runtime_loader_kernel_symbol = 0 +; NOHSA-TRAP-GFX900-V2-NEXT: .end_amd_kernel_code_t +; NOHSA-TRAP-GFX900-V2-NEXT: ; %bb.0: ; %entry ; NOHSA-TRAP-GFX900-V2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; NOHSA-TRAP-GFX900-V2-NEXT: v_mov_b32_e32 v0, 0 ; NOHSA-TRAP-GFX900-V2-NEXT: s_waitcnt lgkmcnt(0) @@ -591,7 +727,7 @@ ; HSA-TRAP-GFX900-V2-NEXT: gds_segment_byte_size = 0 ; HSA-TRAP-GFX900-V2-NEXT: kernarg_segment_byte_size = 8 ; HSA-TRAP-GFX900-V2-NEXT: workgroup_fbarrier_count = 0 -; HSA-TRAP-GFX900-V2-NEXT: wavefront_sgpr_count = 10 +; HSA-TRAP-GFX900-V2-NEXT: wavefront_sgpr_count = 12 ; HSA-TRAP-GFX900-V2-NEXT: workitem_vgpr_count = 2 ; HSA-TRAP-GFX900-V2-NEXT: reserved_vgpr_first = 0 ; HSA-TRAP-GFX900-V2-NEXT: reserved_vgpr_count = 0 @@ -712,7 +848,7 @@ ; HSA-NOTRAP-GFX900-V2-NEXT: gds_segment_byte_size = 0 ; HSA-NOTRAP-GFX900-V2-NEXT: kernarg_segment_byte_size = 8 ; HSA-NOTRAP-GFX900-V2-NEXT: workgroup_fbarrier_count = 0 -; HSA-NOTRAP-GFX900-V2-NEXT: wavefront_sgpr_count = 10 +; HSA-NOTRAP-GFX900-V2-NEXT: wavefront_sgpr_count = 12 ; HSA-NOTRAP-GFX900-V2-NEXT: workitem_vgpr_count = 2 ; HSA-NOTRAP-GFX900-V2-NEXT: reserved_vgpr_first = 0 ; HSA-NOTRAP-GFX900-V2-NEXT: reserved_vgpr_count = 0 @@ -792,7 +928,75 @@ define amdgpu_kernel void @debugtrap(ptr addrspace(1) nocapture readonly %arg0) { ; NOHSA-TRAP-GFX900-V2-LABEL: debugtrap: -; NOHSA-TRAP-GFX900-V2: ; %bb.0: +; NOHSA-TRAP-GFX900-V2: .amd_kernel_code_t +; NOHSA-TRAP-GFX900-V2-NEXT: amd_code_version_major = 1 +; NOHSA-TRAP-GFX900-V2-NEXT: amd_code_version_minor = 2 +; NOHSA-TRAP-GFX900-V2-NEXT: amd_machine_kind = 1 +; NOHSA-TRAP-GFX900-V2-NEXT: amd_machine_version_major = 9 +; NOHSA-TRAP-GFX900-V2-NEXT: amd_machine_version_minor = 0 +; NOHSA-TRAP-GFX900-V2-NEXT: amd_machine_version_stepping = 0 +; NOHSA-TRAP-GFX900-V2-NEXT: kernel_code_entry_byte_offset = 256 +; NOHSA-TRAP-GFX900-V2-NEXT: kernel_code_prefetch_byte_size = 0 +; NOHSA-TRAP-GFX900-V2-NEXT: granulated_workitem_vgpr_count = 0 +; NOHSA-TRAP-GFX900-V2-NEXT: granulated_wavefront_sgpr_count = 0 +; NOHSA-TRAP-GFX900-V2-NEXT: priority = 0 +; NOHSA-TRAP-GFX900-V2-NEXT: float_mode = 240 +; NOHSA-TRAP-GFX900-V2-NEXT: priv = 0 +; NOHSA-TRAP-GFX900-V2-NEXT: enable_dx10_clamp = 1 +; NOHSA-TRAP-GFX900-V2-NEXT: debug_mode = 0 +; NOHSA-TRAP-GFX900-V2-NEXT: enable_ieee_mode = 1 +; NOHSA-TRAP-GFX900-V2-NEXT: enable_wgp_mode = 0 +; NOHSA-TRAP-GFX900-V2-NEXT: enable_mem_ordered = 0 +; NOHSA-TRAP-GFX900-V2-NEXT: enable_fwd_progress = 0 +; NOHSA-TRAP-GFX900-V2-NEXT: enable_sgpr_private_segment_wave_byte_offset = 0 +; NOHSA-TRAP-GFX900-V2-NEXT: user_sgpr_count = 2 +; NOHSA-TRAP-GFX900-V2-NEXT: enable_trap_handler = 0 +; NOHSA-TRAP-GFX900-V2-NEXT: enable_sgpr_workgroup_id_x = 1 +; NOHSA-TRAP-GFX900-V2-NEXT: enable_sgpr_workgroup_id_y = 0 +; NOHSA-TRAP-GFX900-V2-NEXT: enable_sgpr_workgroup_id_z = 0 +; NOHSA-TRAP-GFX900-V2-NEXT: enable_sgpr_workgroup_info = 0 +; NOHSA-TRAP-GFX900-V2-NEXT: enable_vgpr_workitem_id = 0 +; NOHSA-TRAP-GFX900-V2-NEXT: enable_exception_msb = 0 +; NOHSA-TRAP-GFX900-V2-NEXT: granulated_lds_size = 0 +; NOHSA-TRAP-GFX900-V2-NEXT: enable_exception = 0 +; NOHSA-TRAP-GFX900-V2-NEXT: enable_sgpr_private_segment_buffer = 0 +; NOHSA-TRAP-GFX900-V2-NEXT: enable_sgpr_dispatch_ptr = 0 +; NOHSA-TRAP-GFX900-V2-NEXT: enable_sgpr_queue_ptr = 0 +; NOHSA-TRAP-GFX900-V2-NEXT: enable_sgpr_kernarg_segment_ptr = 1 +; NOHSA-TRAP-GFX900-V2-NEXT: enable_sgpr_dispatch_id = 0 +; NOHSA-TRAP-GFX900-V2-NEXT: enable_sgpr_flat_scratch_init = 0 +; NOHSA-TRAP-GFX900-V2-NEXT: enable_sgpr_private_segment_size = 0 +; NOHSA-TRAP-GFX900-V2-NEXT: enable_sgpr_grid_workgroup_count_x = 0 +; NOHSA-TRAP-GFX900-V2-NEXT: enable_sgpr_grid_workgroup_count_y = 0 +; NOHSA-TRAP-GFX900-V2-NEXT: enable_sgpr_grid_workgroup_count_z = 0 +; NOHSA-TRAP-GFX900-V2-NEXT: enable_wavefront_size32 = 0 +; NOHSA-TRAP-GFX900-V2-NEXT: enable_ordered_append_gds = 0 +; NOHSA-TRAP-GFX900-V2-NEXT: private_element_size = 1 +; NOHSA-TRAP-GFX900-V2-NEXT: is_ptr64 = 1 +; NOHSA-TRAP-GFX900-V2-NEXT: is_dynamic_callstack = 0 +; NOHSA-TRAP-GFX900-V2-NEXT: is_debug_enabled = 0 +; NOHSA-TRAP-GFX900-V2-NEXT: is_xnack_enabled = 1 +; NOHSA-TRAP-GFX900-V2-NEXT: workitem_private_segment_byte_size = 0 +; NOHSA-TRAP-GFX900-V2-NEXT: workgroup_group_segment_byte_size = 0 +; NOHSA-TRAP-GFX900-V2-NEXT: gds_segment_byte_size = 0 +; NOHSA-TRAP-GFX900-V2-NEXT: kernarg_segment_byte_size = 44 +; NOHSA-TRAP-GFX900-V2-NEXT: workgroup_fbarrier_count = 0 +; NOHSA-TRAP-GFX900-V2-NEXT: wavefront_sgpr_count = 6 +; NOHSA-TRAP-GFX900-V2-NEXT: workitem_vgpr_count = 3 +; NOHSA-TRAP-GFX900-V2-NEXT: reserved_vgpr_first = 0 +; NOHSA-TRAP-GFX900-V2-NEXT: reserved_vgpr_count = 0 +; NOHSA-TRAP-GFX900-V2-NEXT: reserved_sgpr_first = 0 +; NOHSA-TRAP-GFX900-V2-NEXT: reserved_sgpr_count = 0 +; NOHSA-TRAP-GFX900-V2-NEXT: debug_wavefront_private_segment_offset_sgpr = 0 +; NOHSA-TRAP-GFX900-V2-NEXT: debug_private_segment_buffer_sgpr = 0 +; NOHSA-TRAP-GFX900-V2-NEXT: kernarg_segment_alignment = 4 +; NOHSA-TRAP-GFX900-V2-NEXT: group_segment_alignment = 4 +; NOHSA-TRAP-GFX900-V2-NEXT: private_segment_alignment = 4 +; NOHSA-TRAP-GFX900-V2-NEXT: wavefront_size = 6 +; NOHSA-TRAP-GFX900-V2-NEXT: call_convention = -1 +; NOHSA-TRAP-GFX900-V2-NEXT: runtime_loader_kernel_symbol = 0 +; NOHSA-TRAP-GFX900-V2-NEXT: .end_amd_kernel_code_t +; NOHSA-TRAP-GFX900-V2-NEXT: ; %bb.0: ; NOHSA-TRAP-GFX900-V2-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; NOHSA-TRAP-GFX900-V2-NEXT: v_mov_b32_e32 v0, 0 ; NOHSA-TRAP-GFX900-V2-NEXT: v_mov_b32_e32 v1, 1 @@ -954,7 +1158,7 @@ ; HSA-TRAP-GFX900-V2-NEXT: kernel_code_entry_byte_offset = 256 ; HSA-TRAP-GFX900-V2-NEXT: kernel_code_prefetch_byte_size = 0 ; HSA-TRAP-GFX900-V2-NEXT: granulated_workitem_vgpr_count = 0 -; HSA-TRAP-GFX900-V2-NEXT: granulated_wavefront_sgpr_count = 0 +; HSA-TRAP-GFX900-V2-NEXT: granulated_wavefront_sgpr_count = 1 ; HSA-TRAP-GFX900-V2-NEXT: priority = 0 ; HSA-TRAP-GFX900-V2-NEXT: float_mode = 240 ; HSA-TRAP-GFX900-V2-NEXT: priv = 0 @@ -997,7 +1201,7 @@ ; HSA-TRAP-GFX900-V2-NEXT: gds_segment_byte_size = 0 ; HSA-TRAP-GFX900-V2-NEXT: kernarg_segment_byte_size = 8 ; HSA-TRAP-GFX900-V2-NEXT: workgroup_fbarrier_count = 0 -; HSA-TRAP-GFX900-V2-NEXT: wavefront_sgpr_count = 6 +; HSA-TRAP-GFX900-V2-NEXT: wavefront_sgpr_count = 10 ; HSA-TRAP-GFX900-V2-NEXT: workitem_vgpr_count = 3 ; HSA-TRAP-GFX900-V2-NEXT: reserved_vgpr_first = 0 ; HSA-TRAP-GFX900-V2-NEXT: reserved_vgpr_count = 0 @@ -1064,7 +1268,7 @@ ; HSA-NOTRAP-GFX900-V2-NEXT: kernel_code_entry_byte_offset = 256 ; HSA-NOTRAP-GFX900-V2-NEXT: kernel_code_prefetch_byte_size = 0 ; HSA-NOTRAP-GFX900-V2-NEXT: granulated_workitem_vgpr_count = 0 -; HSA-NOTRAP-GFX900-V2-NEXT: granulated_wavefront_sgpr_count = 0 +; HSA-NOTRAP-GFX900-V2-NEXT: granulated_wavefront_sgpr_count = 1 ; HSA-NOTRAP-GFX900-V2-NEXT: priority = 0 ; HSA-NOTRAP-GFX900-V2-NEXT: float_mode = 240 ; HSA-NOTRAP-GFX900-V2-NEXT: priv = 0 @@ -1107,7 +1311,7 @@ ; HSA-NOTRAP-GFX900-V2-NEXT: gds_segment_byte_size = 0 ; HSA-NOTRAP-GFX900-V2-NEXT: kernarg_segment_byte_size = 8 ; HSA-NOTRAP-GFX900-V2-NEXT: workgroup_fbarrier_count = 0 -; HSA-NOTRAP-GFX900-V2-NEXT: wavefront_sgpr_count = 6 +; HSA-NOTRAP-GFX900-V2-NEXT: wavefront_sgpr_count = 10 ; HSA-NOTRAP-GFX900-V2-NEXT: workitem_vgpr_count = 3 ; HSA-NOTRAP-GFX900-V2-NEXT: reserved_vgpr_first = 0 ; HSA-NOTRAP-GFX900-V2-NEXT: reserved_vgpr_count = 0