This is an archive of the discontinued LLVM Phabricator instance.

Fix typo in GCNSchedStrategy
ClosedPublic

Authored by vpykhtin on Jan 21 2017, 6:25 AM.

Details

Summary

With this fix, the attr-amdgpu-num-sgpr.ll lit test started to fail, but I don't know what it is actually testing, so I don't know how to fix it.

Below is a diff of the test output before and after the fix:

*** before Sat Jan 21 17:06:55 2017
--- after Sat Jan 21 16:52:19 2017
***************
*** 21,27 ****
kernel_code_entry_byte_offset = 256
kernel_code_prefetch_byte_size = 0
max_scratch_backing_memory_byte_size = 0
! granulated_workitem_vgpr_count = 0
granulated_wavefront_sgpr_count = 1
priority = 0
float_mode = 192
--- 21,27 ----
kernel_code_entry_byte_offset = 256
kernel_code_prefetch_byte_size = 0
max_scratch_backing_memory_byte_size = 0
! granulated_workitem_vgpr_count = 1
granulated_wavefront_sgpr_count = 1
priority = 0
float_mode = 192
***************
*** 29,35 ****
enable_dx10_clamp = 1
debug_mode = 0
enable_ieee_mode = 1
! enable_sgpr_private_segment_wave_byte_offset = 1
user_sgpr_count = 6
enable_sgpr_workgroup_id_x = 1
enable_sgpr_workgroup_id_y = 0
--- 29,35 ----
enable_dx10_clamp = 1
debug_mode = 0
enable_ieee_mode = 1
! enable_sgpr_private_segment_wave_byte_offset = 0
user_sgpr_count = 6
enable_sgpr_workgroup_id_x = 1
enable_sgpr_workgroup_id_y = 0
***************
*** 55,67 ****
is_dynamic_callstack = 0
is_debug_enabled = 0
is_xnack_enabled = 0
! workitem_private_segment_byte_size = 20
workgroup_group_segment_byte_size = 0
gds_segment_byte_size = 0
kernarg_segment_byte_size = 48
workgroup_fbarrier_count = 0
! wavefront_sgpr_count = 14
! workitem_vgpr_count = 3
reserved_vgpr_first = 0
reserved_vgpr_count = 0
reserved_sgpr_first = 0
--- 55,67 ----
is_dynamic_callstack = 0
is_debug_enabled = 0
is_xnack_enabled = 0
! workitem_private_segment_byte_size = 0
workgroup_group_segment_byte_size = 0
gds_segment_byte_size = 0
kernarg_segment_byte_size = 48
workgroup_fbarrier_count = 0
! wavefront_sgpr_count = 9
! workitem_vgpr_count = 5
reserved_vgpr_first = 0
reserved_vgpr_count = 0
reserved_sgpr_first = 0
***************
*** 76,142 ****
runtime_loader_kernel_symbol = 0
.end_amd_kernel_code_t
; BB#0:
! s_mov_b64 s[10:11], s[2:3]
! s_mov_b64 s[8:9], s[0:1]
! s_load_dwordx2 s[0:1], s[4:5], 0x10
! s_add_u32 m0, s7, 0x200
! s_load_dwordx2 s[2:3], s[4:5], 0x0
! s_load_dword s6, s[4:5], 0x20
! s_load_dwordx2 vcc, s[4:5], 0x8
! s_nop 0
! s_waitcnt lgkmcnt(0)
! s_buffer_store_dwordx2 s[0:1], s[8:11], m0 ; 8-byte Folded Spill
! s_waitcnt lgkmcnt(0)
s_load_dwordx2 s[0:1], s[4:5], 0x18
- s_mov_b32 m0, s7
- v_mov_b32_e32 v0, s2
- v_mov_b32_e32 v1, s3
- v_mov_b32_e32 v2, s6
- s_waitcnt lgkmcnt(0)
- s_buffer_store_dwordx2 s[0:1], s[8:11], m0 ; 8-byte Folded Spill
- s_waitcnt lgkmcnt(0)
- s_load_dword s0, s[4:5], 0x24
s_waitcnt lgkmcnt(0)
! flat_store_dword v[0:1], v2
s_waitcnt vmcnt(0) lgkmcnt(0)
v_mov_b32_e32 v0, vcc_lo
v_mov_b32_e32 v1, vcc_hi
! s_add_u32 m0, s7, 0x200
! v_mov_b32_e32 v2, s0
! flat_store_dword v[0:1], v2
! s_buffer_load_dwordx2 s[2:3], s[8:11], m0 ; 8-byte Folded Reload
! s_load_dword s1, s[4:5], 0x28
! s_mov_b32 m0, s7
! s_load_dword s4, s[4:5], 0x2c
! s_waitcnt vmcnt(0) lgkmcnt(0)
! v_mov_b32_e32 v0, s2
! v_mov_b32_e32 v1, s3
! v_mov_b32_e32 v2, s1
flat_store_dword v[0:1], v2
- s_buffer_load_dwordx2 s[0:1], s[8:11], m0 ; 8-byte Folded Reload
s_waitcnt vmcnt(0) lgkmcnt(0)
- v_mov_b32_e32 v2, s4
v_mov_b32_e32 v0, s0
v_mov_b32_e32 v1, s1
flat_store_dword v[0:1], v2
- s_dcache_wb
s_endpgm
.Lfunc_end0:
.size max_12_sgprs, .Lfunc_end0-max_12_sgprs

.section .AMDGPU.csdata
; Kernel info:
! ; codeLenInByte = 256
! ; NumSgprs: 14
! ; NumVgprs: 3
; FloatMode: 192
; IeeeMode: 1
! ; ScratchSize: 20
; LDSByteSize: 0 bytes/workgroup (compile time only)
; SGPRBlocks: 1
! ; VGPRBlocks: 0
! ; NumSGPRsForWavesPerEU: 14
! ; NumVGPRsForWavesPerEU: 3
; ReservedVGPRFirst: 0
; ReservedVGPRCount: 0
; COMPUTE_PGM_RSRC2:USER_SGPR: 6
--- 76,127 ----
runtime_loader_kernel_symbol = 0
.end_amd_kernel_code_t
; BB#0:
! s_load_dwordx2 s[2:3], s[4:5], 0x8
! s_load_dwordx2 s[0:1], s[4:5], 0x0
! s_load_dwordx2 vcc, s[4:5], 0x10
! s_load_dword s6, s[4:5], 0x28
! s_waitcnt lgkmcnt(0)
! v_mov_b32_e32 v2, s2
! v_mov_b32_e32 v3, s3
! s_load_dword s2, s[4:5], 0x20
! s_load_dword s3, s[4:5], 0x24
! v_mov_b32_e32 v0, s0
! v_mov_b32_e32 v1, s1
s_load_dwordx2 s[0:1], s[4:5], 0x18
s_waitcnt lgkmcnt(0)
! v_mov_b32_e32 v4, s2
! s_load_dword s4, s[4:5], 0x2c
! flat_store_dword v[0:1], v4
! s_waitcnt vmcnt(0) lgkmcnt(0)
! v_mov_b32_e32 v0, s3
! flat_store_dword v[2:3], v0
s_waitcnt vmcnt(0) lgkmcnt(0)
v_mov_b32_e32 v0, vcc_lo
v_mov_b32_e32 v1, vcc_hi
! v_mov_b32_e32 v2, s6
flat_store_dword v[0:1], v2
s_waitcnt vmcnt(0) lgkmcnt(0)
v_mov_b32_e32 v0, s0
v_mov_b32_e32 v1, s1
+ v_mov_b32_e32 v2, s4
flat_store_dword v[0:1], v2
s_endpgm
.Lfunc_end0:
.size max_12_sgprs, .Lfunc_end0-max_12_sgprs

.section .AMDGPU.csdata
; Kernel info:
! ; codeLenInByte = 168
! ; NumSgprs: 9
! ; NumVgprs: 5
; FloatMode: 192
; IeeeMode: 1
! ; ScratchSize: 0
; LDSByteSize: 0 bytes/workgroup (compile time only)
; SGPRBlocks: 1
! ; VGPRBlocks: 1
! ; NumSGPRsForWavesPerEU: 9
! ; NumVGPRsForWavesPerEU: 5
; ReservedVGPRFirst: 0
; ReservedVGPRCount: 0
; COMPUTE_PGM_RSRC2:USER_SGPR: 6
***************
*** 159,165 ****
kernel_code_entry_byte_offset = 256
kernel_code_prefetch_byte_size = 0
max_scratch_backing_memory_byte_size = 0
! granulated_workitem_vgpr_count = 0
granulated_wavefront_sgpr_count = 1
priority = 0
float_mode = 192
--- 144,150 ----
kernel_code_entry_byte_offset = 256
kernel_code_prefetch_byte_size = 0
max_scratch_backing_memory_byte_size = 0
! granulated_workitem_vgpr_count = 2
granulated_wavefront_sgpr_count = 1
priority = 0
float_mode = 192
***************
*** 167,173 ****
enable_dx10_clamp = 1
debug_mode = 0
enable_ieee_mode = 1
! enable_sgpr_private_segment_wave_byte_offset = 1
user_sgpr_count = 12
enable_sgpr_workgroup_id_x = 1
enable_sgpr_workgroup_id_y = 0
--- 152,158 ----
enable_dx10_clamp = 1
debug_mode = 0
enable_ieee_mode = 1
! enable_sgpr_private_segment_wave_byte_offset = 0
user_sgpr_count = 12
enable_sgpr_workgroup_id_x = 1
enable_sgpr_workgroup_id_y = 0
***************
*** 193,205 ****
is_dynamic_callstack = 0
is_debug_enabled = 0
is_xnack_enabled = 0
! workitem_private_segment_byte_size = 40
workgroup_group_segment_byte_size = 0
gds_segment_byte_size = 0
kernarg_segment_byte_size = 48
workgroup_fbarrier_count = 0
wavefront_sgpr_count = 16
! workitem_vgpr_count = 4
reserved_vgpr_first = 0
reserved_vgpr_count = 0
reserved_sgpr_first = 0
--- 178,190 ----
is_dynamic_callstack = 0
is_debug_enabled = 0
is_xnack_enabled = 0
! workitem_private_segment_byte_size = 0
workgroup_group_segment_byte_size = 0
gds_segment_byte_size = 0
kernarg_segment_byte_size = 48
workgroup_fbarrier_count = 0
wavefront_sgpr_count = 16
! workitem_vgpr_count = 11
reserved_vgpr_first = 0
reserved_vgpr_count = 0
reserved_sgpr_first = 0
***************
*** 216,316 ****
; BB#0:
s_mov_b64 s[10:11], s[2:3]
s_mov_b64 s[8:9], s[0:1]
! s_load_dwordx2 s[0:1], s[8:9], 0x8
s_mov_b32 s7, s13
! s_add_u32 m0, s7, 0x700
! s_buffer_store_dwordx2 s[6:7], s[8:11], m0 ; 8-byte Folded Spill
! s_add_u32 m0, s7, 0x500
! s_waitcnt lgkmcnt(0)
! s_buffer_store_dwordx2 s[0:1], s[8:11], m0 ; 8-byte Folded Spill
! s_waitcnt lgkmcnt(0)
! s_load_dwordx2 s[0:1], s[8:9], 0x10
! s_add_u32 m0, s7, 0x200
! s_load_dword vcc_lo, s[8:9], 0x2c
v_mov_b32_e32 v0, s10
! v_mov_b32_e32 v2, s12
! s_waitcnt lgkmcnt(0)
! s_buffer_store_dwordx2 s[0:1], s[8:11], m0 ; 8-byte Folded Spill
! s_waitcnt lgkmcnt(0)
! s_load_dwordx2 s[0:1], s[8:9], 0x18
! s_mov_b32 m0, s7
v_mov_b32_e32 v1, s11
! v_mov_b32_e32 v3, 0
! s_load_dwordx2 s[2:3], s[8:9], 0x0
! s_load_dword s6, s[8:9], 0x20
! s_nop 0
s_waitcnt lgkmcnt(0)
! s_buffer_store_dwordx2 s[0:1], s[8:11], m0 ; 8-byte Folded Spill
! s_add_u32 m0, s7, 0x400
! s_buffer_store_dword vcc_lo, s[8:11], m0 ; 4-byte Folded Spill
! buffer_store_dword v3, v0, s[8:11], s7 offen
s_nop 0
! flat_store_dword v[0:1], v2
s_nop 0
! flat_store_dword v[0:1], v2
! s_nop 0
! flat_store_dword v[0:1], v2
s_nop 0
flat_store_dwordx2 v[0:1], v[0:1]
s_waitcnt vmcnt(0) lgkmcnt(0)
! v_mov_b32_e32 v0, s4
! v_mov_b32_e32 v1, s5
! flat_store_dwordx2 v[0:1], v[0:1]
! s_add_u32 m0, s7, 0x700
! s_buffer_load_dwordx2 s[4:5], s[8:11], m0 ; 8-byte Folded Reload
! v_mov_b32_e32 v2, s6
! s_add_u32 m0, s7, 0x500
! s_load_dword s0, s[8:9], 0x24
! s_load_dword s1, s[8:9], 0x28
s_waitcnt vmcnt(0) lgkmcnt(0)
v_mov_b32_e32 v0, s4
v_mov_b32_e32 v1, s5
! flat_store_dwordx2 v[0:1], v[0:1]
! s_waitcnt vmcnt(0) lgkmcnt(0)
! v_mov_b32_e32 v0, s2
! v_mov_b32_e32 v1, s3
flat_store_dword v[0:1], v2
- s_buffer_load_dwordx2 s[2:3], s[8:11], m0 ; 8-byte Folded Reload
- s_waitcnt vmcnt(0) lgkmcnt(0)
- v_mov_b32_e32 v2, s0
- s_add_u32 m0, s7, 0x200
- v_mov_b32_e32 v0, s2
- v_mov_b32_e32 v1, s3
- flat_store_dword v[0:1], v2
- s_buffer_load_dwordx2 s[2:3], s[8:11], m0 ; 8-byte Folded Reload
- s_waitcnt vmcnt(0) lgkmcnt(0)
- v_mov_b32_e32 v2, s1
- s_mov_b32 m0, s7
- v_mov_b32_e32 v0, s2
- v_mov_b32_e32 v1, s3
- flat_store_dword v[0:1], v2
- s_buffer_load_dwordx2 s[0:1], s[8:11], m0 ; 8-byte Folded Reload
- s_add_u32 m0, s7, 0x400
s_waitcnt vmcnt(0) lgkmcnt(0)
v_mov_b32_e32 v0, s0
v_mov_b32_e32 v1, s1
! s_buffer_load_dword s0, s[8:11], m0 ; 4-byte Folded Reload
! s_waitcnt lgkmcnt(0)
! v_mov_b32_e32 v2, s0
flat_store_dword v[0:1], v2
- s_dcache_wb
s_endpgm
.Lfunc_end1:
.size max_12_sgprs_14_input_sgprs, .Lfunc_end1-max_12_sgprs_14_input_sgprs

.section .AMDGPU.csdata
; Kernel info:
! ; codeLenInByte = 476
; NumSgprs: 16
! ; NumVgprs: 4
; FloatMode: 192
; IeeeMode: 1
! ; ScratchSize: 40
; LDSByteSize: 0 bytes/workgroup (compile time only)
; SGPRBlocks: 1
! ; VGPRBlocks: 0
; NumSGPRsForWavesPerEU: 16
! ; NumVGPRsForWavesPerEU: 4
; ReservedVGPRFirst: 0
; ReservedVGPRCount: 0
; COMPUTE_PGM_RSRC2:USER_SGPR: 12
--- 201,275 ----
; BB#0:
s_mov_b64 s[10:11], s[2:3]
s_mov_b64 s[8:9], s[0:1]
! s_load_dwordx2 s[2:3], s[8:9], 0x0
s_mov_b32 s7, s13
! s_mov_b64 s[0:1], s[6:7]
! v_mov_b32_e32 v5, s1
! v_mov_b32_e32 v4, s0
! s_load_dword s0, s[8:9], 0x20
! s_waitcnt lgkmcnt(0)
! v_mov_b32_e32 v7, s3
! s_load_dwordx2 vcc, s[8:9], 0x8
! v_mov_b32_e32 v6, s2
! v_mov_b32_e32 v2, s4
! s_load_dword s2, s[8:9], 0x24
! v_mov_b32_e32 v3, s5
! s_load_dwordx2 s[4:5], s[8:9], 0x10
v_mov_b32_e32 v0, s10
! s_load_dword s3, s[8:9], 0x28
! v_mov_b32_e32 v9, s0
! v_mov_b32_e32 v10, 0
! v_mov_b32_e32 v8, s12
v_mov_b32_e32 v1, s11
! s_load_dwordx2 s[0:1], s[8:9], 0x18
! buffer_store_dword v10, v0, s[8:11], s7 offen
s_waitcnt lgkmcnt(0)
! flat_store_dword v[0:1], v8
s_nop 0
! flat_store_dword v[0:1], v8
s_nop 0
! flat_store_dword v[0:1], v8
s_nop 0
flat_store_dwordx2 v[0:1], v[0:1]
+ s_nop 0
+ flat_store_dwordx2 v[0:1], v[2:3]
+ s_nop 0
+ flat_store_dwordx2 v[0:1], v[4:5]
s_waitcnt vmcnt(0) lgkmcnt(0)
! v_mov_b32_e32 v0, vcc_lo
! s_load_dword s6, s[8:9], 0x2c
! v_mov_b32_e32 v1, vcc_hi
! v_mov_b32_e32 v2, s2
! flat_store_dword v[6:7], v9
! s_nop 0
! flat_store_dword v[0:1], v2
s_waitcnt vmcnt(0) lgkmcnt(0)
v_mov_b32_e32 v0, s4
v_mov_b32_e32 v1, s5
! v_mov_b32_e32 v2, s3
flat_store_dword v[0:1], v2
s_waitcnt vmcnt(0) lgkmcnt(0)
v_mov_b32_e32 v0, s0
v_mov_b32_e32 v1, s1
! v_mov_b32_e32 v2, s6
flat_store_dword v[0:1], v2
s_endpgm
.Lfunc_end1:
.size max_12_sgprs_14_input_sgprs, .Lfunc_end1-max_12_sgprs_14_input_sgprs

.section .AMDGPU.csdata
; Kernel info:
! ; codeLenInByte = 296
; NumSgprs: 16
! ; NumVgprs: 11
; FloatMode: 192
; IeeeMode: 1
! ; ScratchSize: 0
; LDSByteSize: 0 bytes/workgroup (compile time only)
; SGPRBlocks: 1
! ; VGPRBlocks: 2
; NumSGPRsForWavesPerEU: 16
! ; NumVGPRsForWavesPerEU: 11
; ReservedVGPRFirst: 0
; ReservedVGPRCount: 0
; COMPUTE_PGM_RSRC2:USER_SGPR: 12

Event Timeline

vpykhtin created this revision. Jan 21 2017, 6:25 AM
vpykhtin edited the summary of this revision. (Show Details) Jan 21 2017, 6:29 AM
vpykhtin edited the summary of this revision. (Show Details)
arsenm edited edge metadata. Jan 23 2017, 11:25 AM

attr-amdgpu-num-sgpr.ll is a problematic test. The attribute has some problems when you specify a very small number of SGPRs. I'm wondering if we should just remove it and use the more general attribute instead

> attr-amdgpu-num-sgpr.ll is a problematic test. The attribute has some problems when you specify a very small number of SGPRs. I'm wondering if we should just remove it and use the more general attribute instead

Should we mark this test as expected fail and submit?

arsenm accepted this revision. Jan 24 2017, 11:21 AM

> > attr-amdgpu-num-sgpr.ll is a problematic test. The attribute has some problems when you specify a very small number of SGPRs. I'm wondering if we should just remove it and use the more general attribute instead

> Should we mark this test as expected fail and submit?

I think that's fine for now

This revision is now accepted and ready to land. Jan 24 2017, 11:21 AM
vpykhtin updated this revision to Diff 85731. Jan 25 2017, 4:25 AM

I updated attr-amdgpu-num-sgpr.ll so it is now passing.

This revision was automatically updated to reflect the committed changes.