With this fix, the attr-amdgpu-num-sgpr.ll lit test starts to fail, but I don't know what the test is actually checking, so I don't know how to fix it.
Below is a context diff of the test's output before (`***`) and after (`---`) the fix:
*** before Sat Jan 21 17:06:55 2017
--- after Sat Jan 21 16:52:19 2017
***************
*** 21,27 ****
kernel_code_entry_byte_offset = 256
kernel_code_prefetch_byte_size = 0
max_scratch_backing_memory_byte_size = 0
! granulated_workitem_vgpr_count = 0
granulated_wavefront_sgpr_count = 1
priority = 0
float_mode = 192
--- 21,27 ----
kernel_code_entry_byte_offset = 256
kernel_code_prefetch_byte_size = 0
max_scratch_backing_memory_byte_size = 0
! granulated_workitem_vgpr_count = 1
granulated_wavefront_sgpr_count = 1
priority = 0
float_mode = 192
***************
*** 29,35 ****
enable_dx10_clamp = 1
debug_mode = 0
enable_ieee_mode = 1
! enable_sgpr_private_segment_wave_byte_offset = 1
user_sgpr_count = 6
enable_sgpr_workgroup_id_x = 1
enable_sgpr_workgroup_id_y = 0
--- 29,35 ----
enable_dx10_clamp = 1
debug_mode = 0
enable_ieee_mode = 1
! enable_sgpr_private_segment_wave_byte_offset = 0
user_sgpr_count = 6
enable_sgpr_workgroup_id_x = 1
enable_sgpr_workgroup_id_y = 0
***************
*** 55,67 ****
is_dynamic_callstack = 0
is_debug_enabled = 0
is_xnack_enabled = 0
! workitem_private_segment_byte_size = 20
workgroup_group_segment_byte_size = 0
gds_segment_byte_size = 0
kernarg_segment_byte_size = 48
workgroup_fbarrier_count = 0
! wavefront_sgpr_count = 14
! workitem_vgpr_count = 3
reserved_vgpr_first = 0
reserved_vgpr_count = 0
reserved_sgpr_first = 0
--- 55,67 ----
is_dynamic_callstack = 0
is_debug_enabled = 0
is_xnack_enabled = 0
! workitem_private_segment_byte_size = 0
workgroup_group_segment_byte_size = 0
gds_segment_byte_size = 0
kernarg_segment_byte_size = 48
workgroup_fbarrier_count = 0
! wavefront_sgpr_count = 9
! workitem_vgpr_count = 5
reserved_vgpr_first = 0
reserved_vgpr_count = 0
reserved_sgpr_first = 0
***************
*** 76,142 ****
runtime_loader_kernel_symbol = 0
.end_amd_kernel_code_t
; BB#0:
! s_mov_b64 s[10:11], s[2:3]
! s_mov_b64 s[8:9], s[0:1]
! s_load_dwordx2 s[0:1], s[4:5], 0x10
! s_add_u32 m0, s7, 0x200
! s_load_dwordx2 s[2:3], s[4:5], 0x0
! s_load_dword s6, s[4:5], 0x20
! s_load_dwordx2 vcc, s[4:5], 0x8
! s_nop 0
! s_waitcnt lgkmcnt(0)
! s_buffer_store_dwordx2 s[0:1], s[8:11], m0 ; 8-byte Folded Spill
! s_waitcnt lgkmcnt(0)
s_load_dwordx2 s[0:1], s[4:5], 0x18
- s_mov_b32 m0, s7
- v_mov_b32_e32 v0, s2
- v_mov_b32_e32 v1, s3
- v_mov_b32_e32 v2, s6
- s_waitcnt lgkmcnt(0)
- s_buffer_store_dwordx2 s[0:1], s[8:11], m0 ; 8-byte Folded Spill
- s_waitcnt lgkmcnt(0)
- s_load_dword s0, s[4:5], 0x24
s_waitcnt lgkmcnt(0)
! flat_store_dword v[0:1], v2
s_waitcnt vmcnt(0) lgkmcnt(0)
v_mov_b32_e32 v0, vcc_lo
v_mov_b32_e32 v1, vcc_hi
! s_add_u32 m0, s7, 0x200
! v_mov_b32_e32 v2, s0
! flat_store_dword v[0:1], v2
! s_buffer_load_dwordx2 s[2:3], s[8:11], m0 ; 8-byte Folded Reload
! s_load_dword s1, s[4:5], 0x28
! s_mov_b32 m0, s7
! s_load_dword s4, s[4:5], 0x2c
! s_waitcnt vmcnt(0) lgkmcnt(0)
! v_mov_b32_e32 v0, s2
! v_mov_b32_e32 v1, s3
! v_mov_b32_e32 v2, s1
flat_store_dword v[0:1], v2
- s_buffer_load_dwordx2 s[0:1], s[8:11], m0 ; 8-byte Folded Reload
s_waitcnt vmcnt(0) lgkmcnt(0)
- v_mov_b32_e32 v2, s4
v_mov_b32_e32 v0, s0
v_mov_b32_e32 v1, s1
flat_store_dword v[0:1], v2
- s_dcache_wb
s_endpgm
.Lfunc_end0:
.size max_12_sgprs, .Lfunc_end0-max_12_sgprs
.section .AMDGPU.csdata
; Kernel info:
! ; codeLenInByte = 256
! ; NumSgprs: 14
! ; NumVgprs: 3
; FloatMode: 192
; IeeeMode: 1
! ; ScratchSize: 20
; LDSByteSize: 0 bytes/workgroup (compile time only)
; SGPRBlocks: 1
! ; VGPRBlocks: 0
! ; NumSGPRsForWavesPerEU: 14
! ; NumVGPRsForWavesPerEU: 3
; ReservedVGPRFirst: 0
; ReservedVGPRCount: 0
; COMPUTE_PGM_RSRC2:USER_SGPR: 6
--- 76,127 ----
runtime_loader_kernel_symbol = 0
.end_amd_kernel_code_t
; BB#0:
! s_load_dwordx2 s[2:3], s[4:5], 0x8
! s_load_dwordx2 s[0:1], s[4:5], 0x0
! s_load_dwordx2 vcc, s[4:5], 0x10
! s_load_dword s6, s[4:5], 0x28
! s_waitcnt lgkmcnt(0)
! v_mov_b32_e32 v2, s2
! v_mov_b32_e32 v3, s3
! s_load_dword s2, s[4:5], 0x20
! s_load_dword s3, s[4:5], 0x24
! v_mov_b32_e32 v0, s0
! v_mov_b32_e32 v1, s1
s_load_dwordx2 s[0:1], s[4:5], 0x18
s_waitcnt lgkmcnt(0)
! v_mov_b32_e32 v4, s2
! s_load_dword s4, s[4:5], 0x2c
! flat_store_dword v[0:1], v4
! s_waitcnt vmcnt(0) lgkmcnt(0)
! v_mov_b32_e32 v0, s3
! flat_store_dword v[2:3], v0
s_waitcnt vmcnt(0) lgkmcnt(0)
v_mov_b32_e32 v0, vcc_lo
v_mov_b32_e32 v1, vcc_hi
! v_mov_b32_e32 v2, s6
flat_store_dword v[0:1], v2
s_waitcnt vmcnt(0) lgkmcnt(0)
v_mov_b32_e32 v0, s0
v_mov_b32_e32 v1, s1
+ v_mov_b32_e32 v2, s4
flat_store_dword v[0:1], v2
s_endpgm
.Lfunc_end0:
.size max_12_sgprs, .Lfunc_end0-max_12_sgprs
.section .AMDGPU.csdata
; Kernel info:
! ; codeLenInByte = 168
! ; NumSgprs: 9
! ; NumVgprs: 5
; FloatMode: 192
; IeeeMode: 1
! ; ScratchSize: 0
; LDSByteSize: 0 bytes/workgroup (compile time only)
; SGPRBlocks: 1
! ; VGPRBlocks: 1
! ; NumSGPRsForWavesPerEU: 9
! ; NumVGPRsForWavesPerEU: 5
; ReservedVGPRFirst: 0
; ReservedVGPRCount: 0
; COMPUTE_PGM_RSRC2:USER_SGPR: 6
***************
*** 159,165 ****
kernel_code_entry_byte_offset = 256
kernel_code_prefetch_byte_size = 0
max_scratch_backing_memory_byte_size = 0
! granulated_workitem_vgpr_count = 0
granulated_wavefront_sgpr_count = 1
priority = 0
float_mode = 192
--- 144,150 ----
kernel_code_entry_byte_offset = 256
kernel_code_prefetch_byte_size = 0
max_scratch_backing_memory_byte_size = 0
! granulated_workitem_vgpr_count = 2
granulated_wavefront_sgpr_count = 1
priority = 0
float_mode = 192
***************
*** 167,173 ****
enable_dx10_clamp = 1
debug_mode = 0
enable_ieee_mode = 1
! enable_sgpr_private_segment_wave_byte_offset = 1
user_sgpr_count = 12
enable_sgpr_workgroup_id_x = 1
enable_sgpr_workgroup_id_y = 0
--- 152,158 ----
enable_dx10_clamp = 1
debug_mode = 0
enable_ieee_mode = 1
! enable_sgpr_private_segment_wave_byte_offset = 0
user_sgpr_count = 12
enable_sgpr_workgroup_id_x = 1
enable_sgpr_workgroup_id_y = 0
***************
*** 193,205 ****
is_dynamic_callstack = 0
is_debug_enabled = 0
is_xnack_enabled = 0
! workitem_private_segment_byte_size = 40
workgroup_group_segment_byte_size = 0
gds_segment_byte_size = 0
kernarg_segment_byte_size = 48
workgroup_fbarrier_count = 0
wavefront_sgpr_count = 16
! workitem_vgpr_count = 4
reserved_vgpr_first = 0
reserved_vgpr_count = 0
reserved_sgpr_first = 0
--- 178,190 ----
is_dynamic_callstack = 0
is_debug_enabled = 0
is_xnack_enabled = 0
! workitem_private_segment_byte_size = 0
workgroup_group_segment_byte_size = 0
gds_segment_byte_size = 0
kernarg_segment_byte_size = 48
workgroup_fbarrier_count = 0
wavefront_sgpr_count = 16
! workitem_vgpr_count = 11
reserved_vgpr_first = 0
reserved_vgpr_count = 0
reserved_sgpr_first = 0
***************
*** 216,316 ****
; BB#0:
s_mov_b64 s[10:11], s[2:3]
s_mov_b64 s[8:9], s[0:1]
! s_load_dwordx2 s[0:1], s[8:9], 0x8
s_mov_b32 s7, s13
! s_add_u32 m0, s7, 0x700
! s_buffer_store_dwordx2 s[6:7], s[8:11], m0 ; 8-byte Folded Spill
! s_add_u32 m0, s7, 0x500
! s_waitcnt lgkmcnt(0)
! s_buffer_store_dwordx2 s[0:1], s[8:11], m0 ; 8-byte Folded Spill
! s_waitcnt lgkmcnt(0)
! s_load_dwordx2 s[0:1], s[8:9], 0x10
! s_add_u32 m0, s7, 0x200
! s_load_dword vcc_lo, s[8:9], 0x2c
v_mov_b32_e32 v0, s10
! v_mov_b32_e32 v2, s12
! s_waitcnt lgkmcnt(0)
! s_buffer_store_dwordx2 s[0:1], s[8:11], m0 ; 8-byte Folded Spill
! s_waitcnt lgkmcnt(0)
! s_load_dwordx2 s[0:1], s[8:9], 0x18
! s_mov_b32 m0, s7
v_mov_b32_e32 v1, s11
! v_mov_b32_e32 v3, 0
! s_load_dwordx2 s[2:3], s[8:9], 0x0
! s_load_dword s6, s[8:9], 0x20
! s_nop 0
s_waitcnt lgkmcnt(0)
! s_buffer_store_dwordx2 s[0:1], s[8:11], m0 ; 8-byte Folded Spill
! s_add_u32 m0, s7, 0x400
! s_buffer_store_dword vcc_lo, s[8:11], m0 ; 4-byte Folded Spill
! buffer_store_dword v3, v0, s[8:11], s7 offen
s_nop 0
! flat_store_dword v[0:1], v2
s_nop 0
! flat_store_dword v[0:1], v2
! s_nop 0
! flat_store_dword v[0:1], v2
s_nop 0
flat_store_dwordx2 v[0:1], v[0:1]
s_waitcnt vmcnt(0) lgkmcnt(0)
! v_mov_b32_e32 v0, s4
! v_mov_b32_e32 v1, s5
! flat_store_dwordx2 v[0:1], v[0:1]
! s_add_u32 m0, s7, 0x700
! s_buffer_load_dwordx2 s[4:5], s[8:11], m0 ; 8-byte Folded Reload
! v_mov_b32_e32 v2, s6
! s_add_u32 m0, s7, 0x500
! s_load_dword s0, s[8:9], 0x24
! s_load_dword s1, s[8:9], 0x28
s_waitcnt vmcnt(0) lgkmcnt(0)
v_mov_b32_e32 v0, s4
v_mov_b32_e32 v1, s5
! flat_store_dwordx2 v[0:1], v[0:1]
! s_waitcnt vmcnt(0) lgkmcnt(0)
! v_mov_b32_e32 v0, s2
! v_mov_b32_e32 v1, s3
flat_store_dword v[0:1], v2
- s_buffer_load_dwordx2 s[2:3], s[8:11], m0 ; 8-byte Folded Reload
- s_waitcnt vmcnt(0) lgkmcnt(0)
- v_mov_b32_e32 v2, s0
- s_add_u32 m0, s7, 0x200
- v_mov_b32_e32 v0, s2
- v_mov_b32_e32 v1, s3
- flat_store_dword v[0:1], v2
- s_buffer_load_dwordx2 s[2:3], s[8:11], m0 ; 8-byte Folded Reload
- s_waitcnt vmcnt(0) lgkmcnt(0)
- v_mov_b32_e32 v2, s1
- s_mov_b32 m0, s7
- v_mov_b32_e32 v0, s2
- v_mov_b32_e32 v1, s3
- flat_store_dword v[0:1], v2
- s_buffer_load_dwordx2 s[0:1], s[8:11], m0 ; 8-byte Folded Reload
- s_add_u32 m0, s7, 0x400
s_waitcnt vmcnt(0) lgkmcnt(0)
v_mov_b32_e32 v0, s0
v_mov_b32_e32 v1, s1
! s_buffer_load_dword s0, s[8:11], m0 ; 4-byte Folded Reload
! s_waitcnt lgkmcnt(0)
! v_mov_b32_e32 v2, s0
flat_store_dword v[0:1], v2
- s_dcache_wb
s_endpgm
.Lfunc_end1:
.size max_12_sgprs_14_input_sgprs, .Lfunc_end1-max_12_sgprs_14_input_sgprs
.section .AMDGPU.csdata
; Kernel info:
! ; codeLenInByte = 476
; NumSgprs: 16
! ; NumVgprs: 4
; FloatMode: 192
; IeeeMode: 1
! ; ScratchSize: 40
; LDSByteSize: 0 bytes/workgroup (compile time only)
; SGPRBlocks: 1
! ; VGPRBlocks: 0
; NumSGPRsForWavesPerEU: 16
! ; NumVGPRsForWavesPerEU: 4
; ReservedVGPRFirst: 0
; ReservedVGPRCount: 0
; COMPUTE_PGM_RSRC2:USER_SGPR: 12
--- 201,275 ----
; BB#0:
s_mov_b64 s[10:11], s[2:3]
s_mov_b64 s[8:9], s[0:1]
! s_load_dwordx2 s[2:3], s[8:9], 0x0
s_mov_b32 s7, s13
! s_mov_b64 s[0:1], s[6:7]
! v_mov_b32_e32 v5, s1
! v_mov_b32_e32 v4, s0
! s_load_dword s0, s[8:9], 0x20
! s_waitcnt lgkmcnt(0)
! v_mov_b32_e32 v7, s3
! s_load_dwordx2 vcc, s[8:9], 0x8
! v_mov_b32_e32 v6, s2
! v_mov_b32_e32 v2, s4
! s_load_dword s2, s[8:9], 0x24
! v_mov_b32_e32 v3, s5
! s_load_dwordx2 s[4:5], s[8:9], 0x10
v_mov_b32_e32 v0, s10
! s_load_dword s3, s[8:9], 0x28
! v_mov_b32_e32 v9, s0
! v_mov_b32_e32 v10, 0
! v_mov_b32_e32 v8, s12
v_mov_b32_e32 v1, s11
! s_load_dwordx2 s[0:1], s[8:9], 0x18
! buffer_store_dword v10, v0, s[8:11], s7 offen
s_waitcnt lgkmcnt(0)
! flat_store_dword v[0:1], v8
s_nop 0
! flat_store_dword v[0:1], v8
s_nop 0
! flat_store_dword v[0:1], v8
s_nop 0
flat_store_dwordx2 v[0:1], v[0:1]
+ s_nop 0
+ flat_store_dwordx2 v[0:1], v[2:3]
+ s_nop 0
+ flat_store_dwordx2 v[0:1], v[4:5]
s_waitcnt vmcnt(0) lgkmcnt(0)
! v_mov_b32_e32 v0, vcc_lo
! s_load_dword s6, s[8:9], 0x2c
! v_mov_b32_e32 v1, vcc_hi
! v_mov_b32_e32 v2, s2
! flat_store_dword v[6:7], v9
! s_nop 0
! flat_store_dword v[0:1], v2
s_waitcnt vmcnt(0) lgkmcnt(0)
v_mov_b32_e32 v0, s4
v_mov_b32_e32 v1, s5
! v_mov_b32_e32 v2, s3
flat_store_dword v[0:1], v2
s_waitcnt vmcnt(0) lgkmcnt(0)
v_mov_b32_e32 v0, s0
v_mov_b32_e32 v1, s1
! v_mov_b32_e32 v2, s6
flat_store_dword v[0:1], v2
s_endpgm
.Lfunc_end1:
.size max_12_sgprs_14_input_sgprs, .Lfunc_end1-max_12_sgprs_14_input_sgprs
.section .AMDGPU.csdata
; Kernel info:
! ; codeLenInByte = 296
; NumSgprs: 16
! ; NumVgprs: 11
; FloatMode: 192
; IeeeMode: 1
! ; ScratchSize: 0
; LDSByteSize: 0 bytes/workgroup (compile time only)
; SGPRBlocks: 1
! ; VGPRBlocks: 2
; NumSGPRsForWavesPerEU: 16
! ; NumVGPRsForWavesPerEU: 11
; ReservedVGPRFirst: 0
; ReservedVGPRCount: 0
; COMPUTE_PGM_RSRC2:USER_SGPR: 12