diff --git a/llvm/docs/AMDGPUUsage.rst b/llvm/docs/AMDGPUUsage.rst --- a/llvm/docs/AMDGPUUsage.rst +++ b/llvm/docs/AMDGPUUsage.rst @@ -6507,11 +6507,27 @@ * FLAT_SCRATCH * EXEC * GFX6-8: M0 - * All SGPR and VGPR registers except the clobbered registers of SGPR4-31 and - VGPR0-31. + * All SGPR registers except the clobbered registers of SGPR4-31. + * VGPR40-47 + VGPR56-63 + VGPR72-79 + VGPR88-95 + VGPR104-111 + VGPR120-127 + VGPR136-143 + VGPR152-159 + VGPR168-175 + VGPR184-191 + VGPR200-207 + VGPR216-223 + VGPR232-239 + VGPR248-255 + *Except the argument registers, the VGPR cloberred and the preserved + registers are intermixed at regular intervals in order to + get a better occupancy.* For the AMDGPU backend, an inter-procedural register allocation (IPRA) - optimization may mark some of clobbered SGPR4-31 and VGPR0-31 registers as + optimization may mark some of clobbered SGPR and VGPR registers as preserved if it can be determined that the called function does not change their value. diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCallingConv.td b/llvm/lib/Target/AMDGPU/AMDGPUCallingConv.td --- a/llvm/lib/Target/AMDGPU/AMDGPUCallingConv.td +++ b/llvm/lib/Target/AMDGPU/AMDGPUCallingConv.td @@ -89,6 +89,24 @@ (sequence "VGPR%u", 32, 255) >; +def CSR_AMDGPU_VGPRs : CalleeSavedRegs< + // The CSRs & scratch-registers are interleaved at a split boundary of 8. + (add (sequence "VGPR%u", 40, 47), + (sequence "VGPR%u", 56, 63), + (sequence "VGPR%u", 72, 79), + (sequence "VGPR%u", 88, 95), + (sequence "VGPR%u", 104, 111), + (sequence "VGPR%u", 120, 127), + (sequence "VGPR%u", 136, 143), + (sequence "VGPR%u", 152, 159), + (sequence "VGPR%u", 168, 175), + (sequence "VGPR%u", 184, 191), + (sequence "VGPR%u", 200, 207), + (sequence "VGPR%u", 216, 223), + (sequence "VGPR%u", 232, 239), + (sequence "VGPR%u", 248, 255)) +>; + def CSR_AMDGPU_SGPRs_32_105 : CalleeSavedRegs< (sequence "SGPR%u", 32, 105) >; @@ -104,7 +122,7 @@ >; def CSR_AMDGPU_HighRegs : CalleeSavedRegs< - (add CSR_AMDGPU_VGPRs_32_255, CSR_AMDGPU_SGPRs_32_105) + (add CSR_AMDGPU_VGPRs, CSR_AMDGPU_SGPRs_32_105) >; // Calling convention for leaf functions diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.ll @@ -727,9 +727,6 @@ ; GPRIDX-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GPRIDX-NEXT: s_mov_b32 s18, 0 ; GPRIDX-NEXT: s_mov_b32 s19, 0x40200000 -; GPRIDX-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; GPRIDX-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; GPRIDX-NEXT: buffer_store_dword v34, off, s[0:3], s32 ; 4-byte Folded Spill ; GPRIDX-NEXT: s_mov_b32 s17, 0x401c0000 ; GPRIDX-NEXT: s_mov_b32 s16, s18 ; GPRIDX-NEXT: s_mov_b32 s15, 0x40180000 @@ -793,9 +790,6 @@ ; GPRIDX-NEXT: global_store_dwordx4 v[0:1], v[7:10], off ; GPRIDX-NEXT: global_store_dwordx4 v[0:1], v[11:14], off ; GPRIDX-NEXT: global_store_dwordx4 v[0:1], v[15:18], off -; GPRIDX-NEXT: buffer_load_dword v34, off, s[0:3], s32 ; 4-byte Folded Reload -; GPRIDX-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; GPRIDX-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload ; GPRIDX-NEXT: s_waitcnt vmcnt(0) ; GPRIDX-NEXT: s_setpc_b64 s[30:31] ; @@ -816,9 +810,6 @@ ; MOVREL-NEXT: s_mov_b32 s8, s18 ; MOVREL-NEXT: s_mov_b64 s[6:7], 2.0 ; MOVREL-NEXT: s_mov_b64 s[4:5], 1.0 -; MOVREL-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; MOVREL-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; MOVREL-NEXT: buffer_store_dword v34, off, s[0:3], s32 ; 4-byte Folded Spill ; MOVREL-NEXT: v_mov_b32_e32 v34, s19 ; MOVREL-NEXT: v_mov_b32_e32 v33, s18 ; MOVREL-NEXT: v_mov_b32_e32 v32, s17 @@ -868,10 +859,6 @@ ; MOVREL-NEXT: global_store_dwordx4 v[0:1], v[7:10], off ; MOVREL-NEXT: global_store_dwordx4 v[0:1], v[11:14], off ; MOVREL-NEXT: global_store_dwordx4 v[0:1], v[15:18], off -; MOVREL-NEXT: buffer_load_dword v34, off, s[0:3], s32 ; 4-byte Folded Reload -; MOVREL-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; MOVREL-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; MOVREL-NEXT: s_waitcnt vmcnt(0) ; MOVREL-NEXT: s_waitcnt_vscnt null, 0x0 ; MOVREL-NEXT: s_setpc_b64 s[30:31] entry: diff --git a/llvm/test/CodeGen/AMDGPU/call-argument-types.ll b/llvm/test/CodeGen/AMDGPU/call-argument-types.ll --- a/llvm/test/CodeGen/AMDGPU/call-argument-types.ll +++ b/llvm/test/CodeGen/AMDGPU/call-argument-types.ll @@ -744,17 +744,13 @@ ; GCN-LABEL: {{^}}tail_call_byval_align16: ; GCN-NOT: s32 -; GCN: buffer_store_dword v32, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; GCN: buffer_store_dword v33, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; GCN: buffer_load_dword v32, off, s[0:3], s32 offset:20 -; GCN: buffer_load_dword v33, off, s[0:3], s32 offset:16 +; GCN: buffer_load_dword v32, off, s[0:3], s32 offset:12 +; GCN: buffer_load_dword v33, off, s[0:3], s32 offset:8 ; GCN: s_getpc_b64 ; GCN: buffer_store_dword v32, off, s[0:3], s32 offset:4 ; GCN: buffer_store_dword v33, off, s[0:3], s32{{$}} -; GCN: buffer_load_dword v33, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; GCN: buffer_load_dword v32, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload ; GCN-NOT: s32 ; GCN: s_setpc_b64 define void @tail_call_byval_align16(<32 x i32> %val, double %tmp) #0 { @@ -766,15 +762,11 @@ ; GCN-LABEL: {{^}}tail_call_stack_passed_arg_alignment_v32i32_f64: ; GCN-NOT: s32 -; GCN: buffer_store_dword v32, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; GCN: buffer_store_dword v33, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill ; GCN: buffer_load_dword v32, off, s[0:3], s32 offset:4 ; GCN: buffer_load_dword v33, off, s[0:3], s32{{$}} ; GCN: s_getpc_b64 ; GCN: buffer_store_dword v33, off, s[0:3], s32{{$}} ; GCN: buffer_store_dword v32, off, s[0:3], s32 offset:4 -; GCN: buffer_load_dword v33, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; GCN: buffer_load_dword v32, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload ; GCN-NOT: s32 ; GCN: s_setpc_b64 define void @tail_call_stack_passed_arg_alignment_v32i32_f64(<32 x i32> %val, double %tmp) #0 { diff --git a/llvm/test/CodeGen/AMDGPU/call-graph-register-usage.ll b/llvm/test/CodeGen/AMDGPU/call-graph-register-usage.ll --- a/llvm/test/CodeGen/AMDGPU/call-graph-register-usage.ll +++ b/llvm/test/CodeGen/AMDGPU/call-graph-register-usage.ll @@ -13,15 +13,15 @@ } ; GCN-LABEL: {{^}}indirect_use_vcc: -; GCN: v_writelane_b32 v32, s33, 2 -; GCN: v_writelane_b32 v32, s30, 0 -; GCN: v_writelane_b32 v32, s31, 1 +; GCN: v_writelane_b32 v40, s33, 2 +; GCN: v_writelane_b32 v40, s30, 0 +; GCN: v_writelane_b32 v40, s31, 1 ; GCN: s_swappc_b64 -; GCN: v_readlane_b32 s4, v32, 0 -; GCN: v_readlane_b32 s5, v32, 1 -; GCN: v_readlane_b32 s33, v32, 2 +; GCN: v_readlane_b32 s4, v40, 0 +; GCN: v_readlane_b32 s5, v40, 1 +; GCN: v_readlane_b32 s33, v40, 2 ; GCN: ; NumSgprs: 36 -; GCN: ; NumVgprs: 33 +; GCN: ; NumVgprs: 41 define void @indirect_use_vcc() #1 { call void @use_vcc() ret void @@ -32,7 +32,7 @@ ; CI: ; NumSgprs: 38 ; VI-NOBUG: ; NumSgprs: 40 ; VI-BUG: ; NumSgprs: 96 -; GCN: ; NumVgprs: 33 +; GCN: ; NumVgprs: 41 define amdgpu_kernel void @indirect_2level_use_vcc_kernel(i32 addrspace(1)* %out) #0 { call void @indirect_use_vcc() ret void @@ -50,7 +50,7 @@ ; GCN-LABEL: {{^}}indirect_use_flat_scratch: ; CI: ; NumSgprs: 38 ; VI: ; NumSgprs: 40 -; GCN: ; NumVgprs: 33 +; GCN: ; NumVgprs: 41 define void @indirect_use_flat_scratch() #1 { call void @use_flat_scratch() ret void @@ -61,7 +61,7 @@ ; CI: ; NumSgprs: 38 ; VI-NOBUG: ; NumSgprs: 40 ; VI-BUG: ; NumSgprs: 96 -; GCN: ; NumVgprs: 33 +; GCN: ; NumVgprs: 41 define amdgpu_kernel void @indirect_2level_use_flat_scratch_kernel(i32 addrspace(1)* %out) #0 { call void @indirect_use_flat_scratch() ret void @@ -76,7 +76,7 @@ } ; GCN-LABEL: {{^}}indirect_use_10_vgpr: -; GCN: ; NumVgprs: 33 +; GCN: ; NumVgprs: 41 define void @indirect_use_10_vgpr() #0 { call void @use_10_vgpr() ret void @@ -84,23 +84,23 @@ ; GCN-LABEL: {{^}}indirect_2_level_use_10_vgpr: ; GCN: is_dynamic_callstack = 0 -; GCN: ; NumVgprs: 33 +; GCN: ; NumVgprs: 41 define amdgpu_kernel void @indirect_2_level_use_10_vgpr() #0 { call void @indirect_use_10_vgpr() ret void } -; GCN-LABEL: {{^}}use_40_vgpr: -; GCN: ; NumVgprs: 40 -define void @use_40_vgpr() #1 { - call void asm sideeffect "", "~{v39}"() #0 +; GCN-LABEL: {{^}}use_50_vgpr: +; GCN: ; NumVgprs: 50 +define void @use_50_vgpr() #1 { + call void asm sideeffect "", "~{v49}"() #0 ret void } -; GCN-LABEL: {{^}}indirect_use_40_vgpr: -; GCN: ; NumVgprs: 40 -define void @indirect_use_40_vgpr() #0 { - call void @use_40_vgpr() +; GCN-LABEL: {{^}}indirect_use_50_vgpr: +; GCN: ; NumVgprs: 50 +define void @indirect_use_50_vgpr() #0 { + call void @use_50_vgpr() ret void } diff --git a/llvm/test/CodeGen/AMDGPU/call-preserved-registers.ll b/llvm/test/CodeGen/AMDGPU/call-preserved-registers.ll --- a/llvm/test/CodeGen/AMDGPU/call-preserved-registers.ll +++ b/llvm/test/CodeGen/AMDGPU/call-preserved-registers.ll @@ -23,22 +23,22 @@ ; GCN-LABEL: {{^}}test_func_call_external_void_func_void_clobber_s30_s31_call_external_void_func_void: ; GCN: buffer_store_dword -; GCN: v_writelane_b32 v32, s33, 4 -; GCN: v_writelane_b32 v32, s34, 0 -; GCN: v_writelane_b32 v32, s35, 1 -; GCN: v_writelane_b32 v32, s30, 2 -; GCN: v_writelane_b32 v32, s31, 3 +; GCN: v_writelane_b32 v40, s33, 4 +; GCN: v_writelane_b32 v40, s34, 0 +; GCN: v_writelane_b32 v40, s35, 1 +; GCN: v_writelane_b32 v40, s30, 2 +; GCN: v_writelane_b32 v40, s31, 3 ; GCN: s_swappc_b64 ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: ;;#ASMEND ; GCN-NEXT: s_swappc_b64 -; GCN-DAG: v_readlane_b32 s4, v32, 2 -; GCN-DAG: v_readlane_b32 s5, v32, 3 -; GCN: v_readlane_b32 s35, v32, 1 -; GCN: v_readlane_b32 s34, v32, 0 +; GCN-DAG: v_readlane_b32 s4, v40, 2 +; GCN-DAG: v_readlane_b32 s5, v40, 3 +; GCN: v_readlane_b32 s35, v40, 1 +; GCN: v_readlane_b32 s34, v40, 0 -; GCN: v_readlane_b32 s33, v32, 4 +; GCN: v_readlane_b32 s33, v40, 4 ; GCN: buffer_load_dword ; GCN: s_setpc_b64 define void @test_func_call_external_void_func_void_clobber_s30_s31_call_external_void_func_void() #0 { @@ -49,16 +49,16 @@ } ; GCN-LABEL: {{^}}test_func_call_external_void_funcx2: -; GCN: buffer_store_dword v32 -; GCN: v_writelane_b32 v32, s33, 4 +; GCN: buffer_store_dword v40 +; GCN: v_writelane_b32 v40, s33, 4 ; GCN: s_mov_b32 s33, s32 ; GCN: s_add_u32 s32, s32, 0x400 ; GCN: s_swappc_b64 ; GCN-NEXT: s_swappc_b64 -; GCN: v_readlane_b32 s33, v32, 4 -; GCN: buffer_load_dword v32, +; GCN: v_readlane_b32 s33, v40, 4 +; GCN: buffer_load_dword v40, define void @test_func_call_external_void_funcx2() #0 { call void @external_void_func_void() call void @external_void_func_void() @@ -115,9 +115,9 @@ } ; GCN-LABEL: {{^}}test_call_void_func_void_mayclobber_v31: -; GCN: v_mov_b32_e32 v32, v31 +; GCN: v_mov_b32_e32 v40, v31 ; GCN-NEXT: s_swappc_b64 -; GCN-NEXT: v_mov_b32_e32 v31, v32 +; GCN-NEXT: v_mov_b32_e32 v31, v40 define amdgpu_kernel void @test_call_void_func_void_mayclobber_v31(i32 addrspace(1)* %out) #0 { %v31 = call i32 asm sideeffect "; def $0", "={v31}"() call void @external_void_func_void() @@ -177,31 +177,31 @@ ret void } -; GCN-LABEL: {{^}}test_call_void_func_void_preserves_v32: {{.*}} +; GCN-LABEL: {{^}}test_call_void_func_void_preserves_v40: {{.*}} ; GCN-NOT: v32 ; GCN: s_getpc_b64 s[4:5] ; GCN-NEXT: s_add_u32 s4, s4, external_void_func_void@rel32@lo+4 ; GCN-NEXT: s_addc_u32 s5, s5, external_void_func_void@rel32@hi+4 ; GCN: s_mov_b32 s32, 0 -; GCN-NOT: v32 +; GCN-NOT: v40 ; GCN: ;;#ASMSTART -; GCN-NEXT: ; def v32 +; GCN-NEXT: ; def v40 ; GCN-NEXT: ;;#ASMEND ; GCN: s_swappc_b64 s[30:31], s[4:5] -; GCN-NOT: v32 +; GCN-NOT: v40 ; GCN: ;;#ASMSTART -; GCN-NEXT: ; use v32 +; GCN-NEXT: ; use v40 ; GCN-NEXT: ;;#ASMEND ; GCN-NEXT: s_endpgm -define amdgpu_kernel void @test_call_void_func_void_preserves_v32(i32 addrspace(1)* %out) #0 { - %v32 = call i32 asm sideeffect "; def $0", "={v32}"() +define amdgpu_kernel void @test_call_void_func_void_preserves_v40(i32 addrspace(1)* %out) #0 { + %v40 = call i32 asm sideeffect "; def $0", "={v40}"() call void @external_void_func_void() - call void asm sideeffect "; use $0", "{v32}"(i32 %v32) + call void asm sideeffect "; use $0", "{v40}"(i32 %v40) ret void } @@ -255,12 +255,12 @@ ; GCN-LABEL: {{^}}callee_saved_sgpr_func: ; GCN-NOT: s40 -; GCN: v_writelane_b32 v32, s40 +; GCN: v_writelane_b32 v40, s40 ; GCN: s_swappc_b64 ; GCN-NOT: s40 ; GCN: ; use s40 ; GCN-NOT: s40 -; GCN: v_readlane_b32 s40, v32 +; GCN: v_readlane_b32 s40, v40 ; GCN-NOT: s40 define void @callee_saved_sgpr_func() #2 { %s40 = call i32 asm sideeffect "; def s40", "={s40}"() #0 @@ -287,19 +287,19 @@ ; First call preserved VGPR is used so it can't be used for SGPR spills. ; GCN-LABEL: {{^}}callee_saved_sgpr_vgpr_func: ; GCN-NOT: s40 -; GCN: v_writelane_b32 v33, s40 +; GCN: v_writelane_b32 v41, s40 ; GCN: s_swappc_b64 ; GCN-NOT: s40 ; GCN: ; use s40 ; GCN-NOT: s40 -; GCN: v_readlane_b32 s40, v33 +; GCN: v_readlane_b32 s40, v41 ; GCN-NOT: s40 define void @callee_saved_sgpr_vgpr_func() #2 { %s40 = call i32 asm sideeffect "; def s40", "={s40}"() #0 - %v32 = call i32 asm sideeffect "; def v32", "={v32}"() #0 + %v40 = call i32 asm sideeffect "; def v40", "={v40}"() #0 call void @external_void_func_void() call void asm sideeffect "; use $0", "s"(i32 %s40) #0 - call void asm sideeffect "; use $0", "v"(i32 %v32) #0 + call void asm sideeffect "; use $0", "v"(i32 %v40) #0 ret void } diff --git a/llvm/test/CodeGen/AMDGPU/call-waitcnt.ll b/llvm/test/CodeGen/AMDGPU/call-waitcnt.ll --- a/llvm/test/CodeGen/AMDGPU/call-waitcnt.ll +++ b/llvm/test/CodeGen/AMDGPU/call-waitcnt.ll @@ -64,11 +64,11 @@ ; GCN-NEXT: s_add_u32 s4, s4, func@rel32@lo+4 ; GCN-NEXT: s_addc_u32 s5, s5, func@rel32@hi+4 ; GCN-NEXT: s_mov_b32 s32, 0 -; GCN-NEXT: v_mov_b32_e32 v32, 0 +; GCN-NEXT: v_mov_b32_e32 v40, 0 ; GCN-NEXT: s_swappc_b64 s[30:31], s[4:5] ; GCN-NEXT: v_mov_b32_e32 v0, s34 ; GCN-NEXT: v_mov_b32_e32 v1, s35 -; GCN-NEXT: global_store_dword v[0:1], v32, off +; GCN-NEXT: global_store_dword v[0:1], v40, off ; GCN-NEXT: s_endpgm call void @func(i32 0) store i32 0, i32 addrspace(1)* %ptr diff --git a/llvm/test/CodeGen/AMDGPU/callee-frame-setup.ll b/llvm/test/CodeGen/AMDGPU/callee-frame-setup.ll --- a/llvm/test/CodeGen/AMDGPU/callee-frame-setup.ll +++ b/llvm/test/CodeGen/AMDGPU/callee-frame-setup.ll @@ -127,8 +127,8 @@ ; GCN-DAG: v_writelane_b32 [[CSR_VGPR]], s31, 1 ; GCN: s_swappc_b64 -; GCN-DAG: v_readlane_b32 s4, v32, 0 -; GCN-DAG: v_readlane_b32 s5, v32, 1 +; GCN-DAG: v_readlane_b32 s4, v40, 0 +; GCN-DAG: v_readlane_b32 s5, v40, 1 ; GCN: s_sub_u32 s32, s32, 0x400 ; GCN-NEXT: v_readlane_b32 s33, [[CSR_VGPR]], [[FP_SPILL_LANE]] @@ -168,6 +168,7 @@ call void asm sideeffect "", "~{v8},~{v9},~{v10},~{v11},~{v12},~{v13},~{v14},~{v15}"() #0 call void asm sideeffect "", "~{v16},~{v17},~{v18},~{v19},~{v20},~{v21},~{v22},~{v23}"() #0 call void asm sideeffect "", "~{v24},~{v25},~{v26},~{v27},~{v28},~{v29},~{v30},~{v31}"() #0 + call void asm sideeffect "", "~{v32},~{v33},~{v34},~{v35},~{v36},~{v37},~{v38},~{v39}"() #0 %wide.sgpr0 = call <16 x i32> asm sideeffect "; def $0", "=s" () #0 %wide.sgpr1 = call <16 x i32> asm sideeffect "; def $0", "=s" () #0 @@ -207,14 +208,14 @@ ; GCN-NEXT:s_mov_b32 [[FP_COPY:s[0-9]+]], s33 ; GCN-NEXT: s_mov_b32 s33, s32 ; GCN: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0 -; GCN-DAG: buffer_store_dword v33, off, s[0:3], s33 ; 4-byte Folded Spill +; GCN-DAG: buffer_store_dword v41, off, s[0:3], s33 ; 4-byte Folded Spill ; GCN-DAG: buffer_store_dword [[ZERO]], off, s[0:3], s33 offset:8 ; GCN: ;;#ASMSTART -; GCN-NEXT: ; clobber v33 +; GCN-NEXT: ; clobber v41 ; GCN-NEXT: ;;#ASMEND -; GCN: buffer_load_dword v33, off, s[0:3], s33 ; 4-byte Folded Reload +; GCN: buffer_load_dword v41, off, s[0:3], s33 ; 4-byte Folded Reload ; GCN: s_add_u32 s32, s32, 0x300 ; GCN-NEXT: s_sub_u32 s32, s32, 0x300 ; GCN-NEXT: s_mov_b32 s33, s4 @@ -223,7 +224,7 @@ define void @callee_with_stack_no_fp_elim_csr_vgpr() #1 { %alloca = alloca i32, addrspace(5) store volatile i32 0, i32 addrspace(5)* %alloca - call void asm sideeffect "; clobber v33", "~{v33}"() + call void asm sideeffect "; clobber v41", "~{v41}"() ret void } @@ -232,7 +233,7 @@ ; GCN: s_waitcnt ; GCN-NEXT: v_writelane_b32 v1, s33, 63 ; GCN-NEXT: s_mov_b32 s33, s32 -; GCN: buffer_store_dword v33, off, s[0:3], s33 ; 4-byte Folded Spill +; GCN: buffer_store_dword v41, off, s[0:3], s33 ; 4-byte Folded Spill ; GCN-COUNT-63: v_writelane_b32 v1 ; GCN: buffer_store_dword v{{[0-9]+}}, off, s[0:3], s33 offset:8 ; GCN: ;;#ASMSTART @@ -246,7 +247,7 @@ define void @last_lane_vgpr_for_fp_csr() #1 { %alloca = alloca i32, addrspace(5) store volatile i32 0, i32 addrspace(5)* %alloca - call void asm sideeffect "; clobber v33", "~{v33}"() + call void asm sideeffect "; clobber v41", "~{v41}"() call void asm sideeffect "", "~{s40},~{s41},~{s42},~{s43},~{s44},~{s45},~{s46},~{s47},~{s48},~{s49} ,~{s50},~{s51},~{s52},~{s53},~{s54},~{s55},~{s56},~{s57},~{s58},~{s59} @@ -264,14 +265,14 @@ ; GCN: s_waitcnt ; GCN-NEXT: s_mov_b32 [[FP_COPY:s[0-9]+]], s33 ; GCN-NEXT: s_mov_b32 s33, s32 -; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s33 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s33 ; 4-byte Folded Spill ; GCN-COUNT-64: v_writelane_b32 v1, ; GCN: buffer_store_dword ; GCN: ;;#ASMSTART ; GCN-COUNT-64: v_readlane_b32 s{{[0-9]+}}, v1 -; GCN: buffer_load_dword v33, off, s[0:3], s33 ; 4-byte Folded Reload +; GCN: buffer_load_dword v41, off, s[0:3], s33 ; 4-byte Folded Reload ; GCN: s_add_u32 s32, s32, 0x300 ; GCN-NEXT: s_sub_u32 s32, s32, 0x300 ; GCN-NEXT: s_mov_b32 s33, [[FP_COPY]] @@ -280,7 +281,7 @@ define void @no_new_vgpr_for_fp_csr() #1 { %alloca = alloca i32, addrspace(5) store volatile i32 0, i32 addrspace(5)* %alloca - call void asm sideeffect "; clobber v33", "~{v33}"() + call void asm sideeffect "; clobber v41", "~{v41}"() call void asm sideeffect "", "~{s39},~{s40},~{s41},~{s42},~{s43},~{s44},~{s45},~{s46},~{s47},~{s48},~{s49} ,~{s50},~{s51},~{s52},~{s53},~{s54},~{s55},~{s56},~{s57},~{s58},~{s59} @@ -347,20 +348,20 @@ ; GCN-NEXT: s_or_saveexec_b64 [[COPY_EXEC0:s\[[0-9]+:[0-9]+\]]], -1{{$}} ; GCN-NEXT: buffer_store_dword [[CSR_VGPR:v[0-9]+]], off, s[0:3], s32 offset:8 ; 4-byte Folded Spill ; GCN-NEXT: s_mov_b64 exec, [[COPY_EXEC0]] -; GCN-NEXT: v_writelane_b32 v32, s33, 2 -; GCN-NEXT: v_writelane_b32 v32, s30, 0 +; GCN-NEXT: v_writelane_b32 [[CSR_VGPR]], s33, 2 +; GCN-NEXT: v_writelane_b32 [[CSR_VGPR]], s30, 0 ; GCN-NEXT: s_mov_b32 s33, s32 -; GCN-DAG: v_writelane_b32 v32, s31, 1 +; GCN-DAG: v_writelane_b32 [[CSR_VGPR]], s31, 1 ; GCN-DAG: buffer_store_dword ; GCN: s_add_u32 s32, s32, 0x300{{$}} ; GCN: ;;#ASMSTART -; GCN: v_readlane_b32 s4, v32, 0 -; GCN-NEXT: v_readlane_b32 s5, v32, 1 +; GCN: v_readlane_b32 s4, [[CSR_VGPR]], 0 +; GCN-NEXT: v_readlane_b32 s5, [[CSR_VGPR]], 1 ; GCN-NEXT: s_sub_u32 s32, s32, 0x300{{$}} -; GCN-NEXT: v_readlane_b32 s33, v32, 2 +; GCN-NEXT: v_readlane_b32 s33, [[CSR_VGPR]], 2 ; GCN-NEXT: s_or_saveexec_b64 [[COPY_EXEC1:s\[[0-9]+:[0-9]+\]]], -1{{$}} ; GCN-NEXT: buffer_load_dword [[CSR_VGPR]], off, s[0:3], s32 offset:8 ; 4-byte Folded Reload ; GCN-NEXT: s_mov_b64 exec, [[COPY_EXEC1]] @@ -377,11 +378,11 @@ ,~{s20},~{s21},~{s22},~{s23},~{s24},~{s25},~{s26},~{s27},~{s28},~{s29} ,~{s30},~{s31}"() #0 - call void asm sideeffect "; clobber nonpreserved VGPRs", + call void asm sideeffect "; clobber nonpreserved initial VGPRs", "~{v0},~{v1},~{v2},~{v3},~{v4},~{v5},~{v6},~{v7},~{v8},~{v9} ,~{v10},~{v11},~{v12},~{v13},~{v14},~{v15},~{v16},~{v17},~{v18},~{v19} ,~{v20},~{v21},~{v22},~{v23},~{v24},~{v25},~{v26},~{v27},~{v28},~{v29} - ,~{v30},~{v31}"() #1 + ,~{v30},~{v31},~{v32},~{v33},~{v34},~{v35},~{v36},~{v37},~{v38},~{v39}"() #1 ret void } @@ -394,19 +395,19 @@ ; GCN-NEXT: v_mov_b32_e32 [[SCRATCH_VGPR:v[0-9]+]], 0x1008 ; GCN-NEXT: buffer_store_dword [[CSR_VGPR:v[0-9]+]], [[SCRATCH_VGPR]], s[0:3], s32 offen ; 4-byte Folded Spill ; GCN-NEXT: s_mov_b64 exec, [[COPY_EXEC0]] -; GCN-NEXT: v_writelane_b32 v32, s33, 2 -; GCN-NEXT: v_writelane_b32 v32, s30, 0 +; GCN-NEXT: v_writelane_b32 [[CSR_VGPR]], s33, 2 +; GCN-NEXT: v_writelane_b32 [[CSR_VGPR]], s30, 0 ; GCN-NEXT: s_mov_b32 s33, s32 -; GCN-DAG: v_writelane_b32 v32, s31, 1 +; GCN-DAG: v_writelane_b32 [[CSR_VGPR]], s31, 1 ; GCN-DAG: s_add_u32 s32, s32, 0x40300{{$}} ; GCN-DAG: buffer_store_dword ; GCN: ;;#ASMSTART -; GCN: v_readlane_b32 s4, v32, 0 -; GCN-NEXT: v_readlane_b32 s5, v32, 1 +; GCN: v_readlane_b32 s4, [[CSR_VGPR]], 0 +; GCN-NEXT: v_readlane_b32 s5, [[CSR_VGPR]], 1 ; GCN-NEXT: s_sub_u32 s32, s32, 0x40300{{$}} -; GCN-NEXT: v_readlane_b32 s33, v32, 2 +; GCN-NEXT: v_readlane_b32 s33, [[CSR_VGPR]], 2 ; GCN-NEXT: s_or_saveexec_b64 [[COPY_EXEC1:s\[[0-9]+:[0-9]+\]]], -1{{$}} ; GCN-NEXT: v_mov_b32_e32 [[SCRATCH_VGPR:v[0-9]+]], 0x1008 ; GCN-NEXT: buffer_load_dword [[CSR_VGPR]], [[SCRATCH_VGPR]], s[0:3], s32 offen ; 4-byte Folded Reload @@ -429,7 +430,7 @@ "~{v0},~{v1},~{v2},~{v3},~{v4},~{v5},~{v6},~{v7},~{v8},~{v9} ,~{v10},~{v11},~{v12},~{v13},~{v14},~{v15},~{v16},~{v17},~{v18},~{v19} ,~{v20},~{v21},~{v22},~{v23},~{v24},~{v25},~{v26},~{v27},~{v28},~{v29} - ,~{v30},~{v31}"() #1 + ,~{v30},~{v31},~{v32},~{v33},~{v34},~{v35},~{v36},~{v37},~{v38},~{v39}"() #1 ret void } diff --git a/llvm/test/CodeGen/AMDGPU/callee-special-input-sgprs-fixed-abi.ll b/llvm/test/CodeGen/AMDGPU/callee-special-input-sgprs-fixed-abi.ll --- a/llvm/test/CodeGen/AMDGPU/callee-special-input-sgprs-fixed-abi.ll +++ b/llvm/test/CodeGen/AMDGPU/callee-special-input-sgprs-fixed-abi.ll @@ -144,7 +144,7 @@ ; GCN-NOT: s12 ; GCN-NOT: s13 ; GCN-NOT: s14 -; GCN: v_readlane_b32 s4, v32, 0 +; GCN: v_readlane_b32 s4, v40, 0 define hidden void @func_indirect_use_workgroup_id_x() #1 { call void @use_workgroup_id_x() ret void @@ -152,7 +152,7 @@ ; GCN-LABEL: {{^}}func_indirect_use_workgroup_id_y: ; GCN-NOT: s4 -; GCN: v_readlane_b32 s4, v32, 0 +; GCN: v_readlane_b32 s4, v40, 0 define hidden void @func_indirect_use_workgroup_id_y() #1 { call void @use_workgroup_id_y() ret void @@ -160,7 +160,7 @@ ; GCN-LABEL: {{^}}func_indirect_use_workgroup_id_z: ; GCN-NOT: s4 -; GCN: v_readlane_b32 s4, v32, 0 +; GCN: v_readlane_b32 s4, v40, 0 define hidden void @func_indirect_use_workgroup_id_z() #1 { call void @use_workgroup_id_z() ret void diff --git a/llvm/test/CodeGen/AMDGPU/callee-special-input-sgprs.ll b/llvm/test/CodeGen/AMDGPU/callee-special-input-sgprs.ll --- a/llvm/test/CodeGen/AMDGPU/callee-special-input-sgprs.ll +++ b/llvm/test/CodeGen/AMDGPU/callee-special-input-sgprs.ll @@ -302,7 +302,7 @@ ; Argument is in right place already ; GCN-LABEL: {{^}}func_indirect_use_workgroup_id_x: ; GCN-NOT: s4 -; GCN: v_readlane_b32 s4, v32, 0 +; GCN: v_readlane_b32 s4, v40, 0 define hidden void @func_indirect_use_workgroup_id_x() #1 { call void @use_workgroup_id_x() ret void @@ -310,7 +310,7 @@ ; GCN-LABEL: {{^}}func_indirect_use_workgroup_id_y: ; GCN-NOT: s4 -; GCN: v_readlane_b32 s4, v32, 0 +; GCN: v_readlane_b32 s4, v40, 0 define hidden void @func_indirect_use_workgroup_id_y() #1 { call void @use_workgroup_id_y() ret void @@ -318,7 +318,7 @@ ; GCN-LABEL: {{^}}func_indirect_use_workgroup_id_z: ; GCN-NOT: s4 -; GCN: v_readlane_b32 s4, v32, 0 +; GCN: v_readlane_b32 s4, v40, 0 define hidden void @func_indirect_use_workgroup_id_z() #1 { call void @use_workgroup_id_z() ret void diff --git a/llvm/test/CodeGen/AMDGPU/callee-special-input-vgprs.ll b/llvm/test/CodeGen/AMDGPU/callee-special-input-vgprs.ll --- a/llvm/test/CodeGen/AMDGPU/callee-special-input-vgprs.ll +++ b/llvm/test/CodeGen/AMDGPU/callee-special-input-vgprs.ll @@ -396,13 +396,11 @@ } ; GCN-LABEL: {{^}}too_many_args_use_workitem_id_x: -; VARABI: buffer_store_dword v32, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; VARABI: buffer_load_dword v32, off, s[0:3], s32{{$}} ; VARABI: v_and_b32_e32 v32, 0x3ff, v32 ; VARABI: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+]}}, v32 -; VARABI: buffer_load_dword v32, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; VARABI-NEXT: s_waitcnt +; VARABI: s_waitcnt ; VARABI-NEXT: s_setpc_b64 ; FIXEDABI: v_and_b32_e32 v31, 0x3ff, v31 @@ -514,15 +512,15 @@ ; Requires loading and storing to stack slot. ; GCN-LABEL: {{^}}too_many_args_call_too_many_args_use_workitem_id_x: ; GCN-DAG: s_add_u32 s32, s32, 0x400{{$}} -; GCN-DAG: buffer_store_dword v32, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill +; GCN-DAG: buffer_store_dword v40, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GCN-DAG: buffer_load_dword v32, off, s[0:3], s33{{$}} ; GCN: buffer_store_dword v32, off, s[0:3], s32{{$}} ; GCN: s_swappc_b64 -; GCN: buffer_load_dword v32, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload ; GCN: s_sub_u32 s32, s32, 0x400{{$}} +; GCN: buffer_load_dword v40, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; GCN: s_setpc_b64 define void @too_many_args_call_too_many_args_use_workitem_id_x( i32 %arg0, i32 %arg1, i32 %arg2, i32 %arg3, i32 %arg4, i32 %arg5, i32 %arg6, i32 %arg7, @@ -543,13 +541,11 @@ ; frame[2] = VGPR spill slot ; GCN-LABEL: {{^}}too_many_args_use_workitem_id_x_byval: -; VARABI: buffer_store_dword v32, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill ; VARABI: buffer_load_dword v32, off, s[0:3], s32 offset:4 ; VARABI-NEXT: s_waitcnt ; VARABI-NEXT: v_and_b32_e32 v32, 0x3ff, v32 ; VARABI-NEXT: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, v32 ; VARABI: buffer_load_dword v0, off, s[0:3], s32{{$}} -; VARABI: buffer_load_dword v32, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload ; VARABI: s_setpc_b64 @@ -700,10 +696,7 @@ ret void } -; Only one stack load should be emitted for all 3 values. ; GCN-LABEL: {{^}}too_many_args_use_workitem_id_xyz: -; VARABI: buffer_store_dword v32, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; VARABI: buffer_store_dword v33, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; VARABI-NOT: buffer_load_dword v{{[0-9]+}}, off, s[0:3], s32{{$}} ; VARABI: buffer_load_dword v32, off, s[0:3], s32{{$}} ; VARABI-NOT: buffer_load_dword @@ -717,9 +710,7 @@ ; VARABI-NEXT: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+]}}, [[BFE_Y]] ; VARABI-NEXT: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+]}}, [[BFE_Z]] -; VARABI: buffer_load_dword v33, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; VARABI: buffer_load_dword v32, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; VARABI-NEXT: s_waitcnt +; VARABI: s_waitcnt ; VARABI-NEXT: s_setpc_b64 @@ -826,7 +817,7 @@ ; GCN: s_waitcnt ; GCN-NEXT: s_setpc_b64 -; GCN: ScratchSize: 8 +; GCN: ScratchSize: 0 define void @too_many_args_use_workitem_id_x_stack_yz( i32 %arg0, i32 %arg1, i32 %arg2, i32 %arg3, i32 %arg4, i32 %arg5, i32 %arg6, i32 %arg7, i32 %arg8, i32 %arg9, i32 %arg10, i32 %arg11, i32 %arg12, i32 %arg13, i32 %arg14, i32 %arg15, diff --git a/llvm/test/CodeGen/AMDGPU/cross-block-use-is-not-abi-copy.ll b/llvm/test/CodeGen/AMDGPU/cross-block-use-is-not-abi-copy.ll --- a/llvm/test/CodeGen/AMDGPU/cross-block-use-is-not-abi-copy.ll +++ b/llvm/test/CodeGen/AMDGPU/cross-block-use-is-not-abi-copy.ll @@ -28,23 +28,23 @@ ; GCN: ; %bb.0: ; %bb0 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1 -; GCN-NEXT: buffer_store_dword v32, off, s[0:3], s32 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill ; GCN-NEXT: s_mov_b64 exec, s[4:5] -; GCN-NEXT: v_writelane_b32 v32, s33, 2 -; GCN-NEXT: v_writelane_b32 v32, s30, 0 +; GCN-NEXT: v_writelane_b32 v40, s33, 2 +; GCN-NEXT: v_writelane_b32 v40, s30, 0 ; GCN-NEXT: s_mov_b32 s33, s32 ; GCN-NEXT: s_add_u32 s32, s32, 0x400 ; GCN-NEXT: s_getpc_b64 s[4:5] ; GCN-NEXT: s_add_u32 s4, s4, func_v2f32@rel32@lo+4 ; GCN-NEXT: s_addc_u32 s5, s5, func_v2f32@rel32@hi+4 -; GCN-NEXT: v_writelane_b32 v32, s31, 1 +; GCN-NEXT: v_writelane_b32 v40, s31, 1 ; GCN-NEXT: s_swappc_b64 s[30:31], s[4:5] -; GCN-NEXT: v_readlane_b32 s4, v32, 0 -; GCN-NEXT: v_readlane_b32 s5, v32, 1 +; GCN-NEXT: v_readlane_b32 s4, v40, 0 +; GCN-NEXT: v_readlane_b32 s5, v40, 1 ; GCN-NEXT: s_sub_u32 s32, s32, 0x400 -; GCN-NEXT: v_readlane_b32 s33, v32, 2 +; GCN-NEXT: v_readlane_b32 s33, v40, 2 ; GCN-NEXT: s_or_saveexec_b64 s[6:7], -1 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload ; GCN-NEXT: s_mov_b64 exec, s[6:7] ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_setpc_b64 s[4:5] @@ -62,23 +62,23 @@ ; GCN: ; %bb.0: ; %bb0 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1 -; GCN-NEXT: buffer_store_dword v32, off, s[0:3], s32 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill ; GCN-NEXT: s_mov_b64 exec, s[4:5] -; GCN-NEXT: v_writelane_b32 v32, s33, 2 -; GCN-NEXT: v_writelane_b32 v32, s30, 0 +; GCN-NEXT: v_writelane_b32 v40, s33, 2 +; GCN-NEXT: v_writelane_b32 v40, s30, 0 ; GCN-NEXT: s_mov_b32 s33, s32 ; GCN-NEXT: s_add_u32 s32, s32, 0x400 ; GCN-NEXT: s_getpc_b64 s[4:5] ; GCN-NEXT: s_add_u32 s4, s4, func_v3f32@rel32@lo+4 ; GCN-NEXT: s_addc_u32 s5, s5, func_v3f32@rel32@hi+4 -; GCN-NEXT: v_writelane_b32 v32, s31, 1 +; GCN-NEXT: v_writelane_b32 v40, s31, 1 ; GCN-NEXT: s_swappc_b64 s[30:31], s[4:5] -; GCN-NEXT: v_readlane_b32 s4, v32, 0 -; GCN-NEXT: v_readlane_b32 s5, v32, 1 +; GCN-NEXT: v_readlane_b32 s4, v40, 0 +; GCN-NEXT: v_readlane_b32 s5, v40, 1 ; GCN-NEXT: s_sub_u32 s32, s32, 0x400 -; GCN-NEXT: v_readlane_b32 s33, v32, 2 +; GCN-NEXT: v_readlane_b32 s33, v40, 2 ; GCN-NEXT: s_or_saveexec_b64 s[6:7], -1 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload ; GCN-NEXT: s_mov_b64 exec, s[6:7] ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_setpc_b64 s[4:5] @@ -96,23 +96,23 @@ ; GCN: ; %bb.0: ; %bb0 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1 -; GCN-NEXT: buffer_store_dword v32, off, s[0:3], s32 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill ; GCN-NEXT: s_mov_b64 exec, s[4:5] -; GCN-NEXT: v_writelane_b32 v32, s33, 2 -; GCN-NEXT: v_writelane_b32 v32, s30, 0 +; GCN-NEXT: v_writelane_b32 v40, s33, 2 +; GCN-NEXT: v_writelane_b32 v40, s30, 0 ; GCN-NEXT: s_mov_b32 s33, s32 ; GCN-NEXT: s_add_u32 s32, s32, 0x400 ; GCN-NEXT: s_getpc_b64 s[4:5] ; GCN-NEXT: s_add_u32 s4, s4, func_v4f16@rel32@lo+4 ; GCN-NEXT: s_addc_u32 s5, s5, func_v4f16@rel32@hi+4 -; GCN-NEXT: v_writelane_b32 v32, s31, 1 +; GCN-NEXT: v_writelane_b32 v40, s31, 1 ; GCN-NEXT: s_swappc_b64 s[30:31], s[4:5] -; GCN-NEXT: v_readlane_b32 s4, v32, 0 -; GCN-NEXT: v_readlane_b32 s5, v32, 1 +; GCN-NEXT: v_readlane_b32 s4, v40, 0 +; GCN-NEXT: v_readlane_b32 s5, v40, 1 ; GCN-NEXT: s_sub_u32 s32, s32, 0x400 -; GCN-NEXT: v_readlane_b32 s33, v32, 2 +; GCN-NEXT: v_readlane_b32 s33, v40, 2 ; GCN-NEXT: s_or_saveexec_b64 s[6:7], -1 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload ; GCN-NEXT: s_mov_b64 exec, s[6:7] ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_setpc_b64 s[4:5] @@ -130,24 +130,24 @@ ; GCN: ; %bb.0: ; %bb0 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1 -; GCN-NEXT: buffer_store_dword v32, off, s[0:3], s32 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill ; GCN-NEXT: s_mov_b64 exec, s[4:5] -; GCN-NEXT: v_writelane_b32 v32, s33, 2 -; GCN-NEXT: v_writelane_b32 v32, s30, 0 +; GCN-NEXT: v_writelane_b32 v40, s33, 2 +; GCN-NEXT: v_writelane_b32 v40, s30, 0 ; GCN-NEXT: s_mov_b32 s33, s32 ; GCN-NEXT: s_add_u32 s32, s32, 0x400 ; GCN-NEXT: s_getpc_b64 s[4:5] ; GCN-NEXT: s_add_u32 s4, s4, func_struct@rel32@lo+4 ; GCN-NEXT: s_addc_u32 s5, s5, func_struct@rel32@hi+4 -; GCN-NEXT: v_writelane_b32 v32, s31, 1 +; GCN-NEXT: v_writelane_b32 v40, s31, 1 ; GCN-NEXT: s_swappc_b64 s[30:31], s[4:5] -; GCN-NEXT: v_readlane_b32 s4, v32, 0 -; GCN-NEXT: v_readlane_b32 s5, v32, 1 +; GCN-NEXT: v_readlane_b32 s4, v40, 0 +; GCN-NEXT: v_readlane_b32 s5, v40, 1 ; GCN-NEXT: v_mov_b32_e32 v1, v4 ; GCN-NEXT: s_sub_u32 s32, s32, 0x400 -; GCN-NEXT: v_readlane_b32 s33, v32, 2 +; GCN-NEXT: v_readlane_b32 s33, v40, 2 ; GCN-NEXT: s_or_saveexec_b64 s[6:7], -1 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload ; GCN-NEXT: s_mov_b64 exec, s[6:7] ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_setpc_b64 s[4:5] diff --git a/llvm/test/CodeGen/AMDGPU/ipra-regmask.ll b/llvm/test/CodeGen/AMDGPU/ipra-regmask.ll --- a/llvm/test/CodeGen/AMDGPU/ipra-regmask.ll +++ b/llvm/test/CodeGen/AMDGPU/ipra-regmask.ll @@ -3,7 +3,7 @@ ; CHECK-DAG: csr Clobbered Registers: $vgpr0 $vgpr0_hi16 $vgpr0_lo16 $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 $vgpr0_vgpr1_vgpr2_vgpr3 $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4 $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5 $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 $vgpr0_vgpr1 $vgpr0_vgpr1_vgpr2 {{$}} define void @csr() #0 { - call void asm sideeffect "", "~{v0},~{v36},~{v37}"() #0 + call void asm sideeffect "", "~{v0},~{v44},~{v45}"() #0 ret void } diff --git a/llvm/test/CodeGen/AMDGPU/mul24-pass-ordering.ll b/llvm/test/CodeGen/AMDGPU/mul24-pass-ordering.ll --- a/llvm/test/CodeGen/AMDGPU/mul24-pass-ordering.ll +++ b/llvm/test/CodeGen/AMDGPU/mul24-pass-ordering.ll @@ -187,44 +187,44 @@ ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 -; GFX9-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[4:5] -; GFX9-NEXT: v_writelane_b32 v35, s33, 4 +; GFX9-NEXT: v_writelane_b32 v43, s33, 4 ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_add_u32 s32, s32, 0x800 -; GFX9-NEXT: buffer_store_dword v32, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s33 ; 4-byte Folded Spill -; GFX9-NEXT: v_writelane_b32 v35, s34, 0 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX9-NEXT: v_writelane_b32 v43, s34, 0 ; GFX9-NEXT: s_getpc_b64 s[4:5] ; GFX9-NEXT: s_add_u32 s4, s4, foo@gotpcrel32@lo+4 ; GFX9-NEXT: s_addc_u32 s5, s5, foo@gotpcrel32@hi+4 -; GFX9-NEXT: v_writelane_b32 v35, s35, 1 +; GFX9-NEXT: v_writelane_b32 v43, s35, 1 ; GFX9-NEXT: s_load_dwordx2 s[34:35], s[4:5], 0x0 -; GFX9-NEXT: v_mov_b32_e32 v32, v1 -; GFX9-NEXT: v_mov_b32_e32 v33, v0 -; GFX9-NEXT: v_writelane_b32 v35, s30, 2 -; GFX9-NEXT: v_mul_u32_u24_e32 v0, v33, v32 -; GFX9-NEXT: v_writelane_b32 v35, s31, 3 -; GFX9-NEXT: v_and_b32_e32 v34, 0xffffff, v32 +; GFX9-NEXT: v_mov_b32_e32 v40, v1 +; GFX9-NEXT: v_mov_b32_e32 v41, v0 +; GFX9-NEXT: v_writelane_b32 v43, s30, 2 +; GFX9-NEXT: v_mul_u32_u24_e32 v0, v41, v40 +; GFX9-NEXT: v_writelane_b32 v43, s31, 3 +; GFX9-NEXT: v_and_b32_e32 v42, 0xffffff, v40 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35] -; GFX9-NEXT: v_mad_u32_u24 v32, v33, v32, v34 -; GFX9-NEXT: v_mov_b32_e32 v0, v32 +; GFX9-NEXT: v_mad_u32_u24 v40, v41, v40, v42 +; GFX9-NEXT: v_mov_b32_e32 v0, v40 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35] -; GFX9-NEXT: v_add_u32_e32 v0, v32, v34 +; GFX9-NEXT: v_add_u32_e32 v0, v40, v42 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35] -; GFX9-NEXT: v_readlane_b32 s4, v35, 2 -; GFX9-NEXT: v_readlane_b32 s5, v35, 3 -; GFX9-NEXT: v_readlane_b32 s35, v35, 1 -; GFX9-NEXT: v_readlane_b32 s34, v35, 0 -; GFX9-NEXT: buffer_load_dword v34, off, s[0:3], s33 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v33, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload +; GFX9-NEXT: v_readlane_b32 s4, v43, 2 +; GFX9-NEXT: v_readlane_b32 s5, v43, 3 +; GFX9-NEXT: v_readlane_b32 s35, v43, 1 +; GFX9-NEXT: v_readlane_b32 s34, v43, 0 +; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s33 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload ; GFX9-NEXT: s_sub_u32 s32, s32, 0x800 -; GFX9-NEXT: v_readlane_b32 s33, v35, 4 +; GFX9-NEXT: v_readlane_b32 s33, v43, 4 ; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1 -; GFX9-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[6:7] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[4:5] diff --git a/llvm/test/CodeGen/AMDGPU/nested-calls.ll b/llvm/test/CodeGen/AMDGPU/nested-calls.ll --- a/llvm/test/CodeGen/AMDGPU/nested-calls.ll +++ b/llvm/test/CodeGen/AMDGPU/nested-calls.ll @@ -12,23 +12,23 @@ ; Spill CSR VGPR used for SGPR spilling ; GCN: s_or_saveexec_b64 [[COPY_EXEC0:s\[[0-9]+:[0-9]+\]]], -1{{$}} -; GCN-NEXT: buffer_store_dword v32, off, s[0:3], s32 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill ; GCN-NEXT: s_mov_b64 exec, [[COPY_EXEC0]] -; GCN-DAG: v_writelane_b32 v32, s33, 2 +; GCN-DAG: v_writelane_b32 v40, s33, 2 ; GCN-DAG: s_mov_b32 s33, s32 ; GCN-DAG: s_add_u32 s32, s32, 0x400 -; GCN-DAG: v_writelane_b32 v32, s30, 0 -; GCN-DAG: v_writelane_b32 v32, s31, 1 +; GCN-DAG: v_writelane_b32 v40, s30, 0 +; GCN-DAG: v_writelane_b32 v40, s31, 1 ; GCN: s_swappc_b64 -; GCN: v_readlane_b32 s4, v32, 0 -; GCN: v_readlane_b32 s5, v32, 1 +; GCN: v_readlane_b32 s4, v40, 0 +; GCN: v_readlane_b32 s5, v40, 1 ; GCN-NEXT: s_sub_u32 s32, s32, 0x400 -; GCN-NEXT: v_readlane_b32 s33, v32, 2 +; GCN-NEXT: v_readlane_b32 s33, v40, 2 ; GCN: s_or_saveexec_b64 [[COPY_EXEC1:s\[[0-9]+:[0-9]+\]]], -1{{$}} -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload ; GCN-NEXT: s_mov_b64 exec, [[COPY_EXEC1]] ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_setpc_b64 s[4:5] diff --git a/llvm/test/CodeGen/AMDGPU/regbank-reassign.mir b/llvm/test/CodeGen/AMDGPU/regbank-reassign.mir --- a/llvm/test/CodeGen/AMDGPU/regbank-reassign.mir +++ b/llvm/test/CodeGen/AMDGPU/regbank-reassign.mir @@ -254,7 +254,7 @@ ... # GCN-LABEL: csr{{$}} -# GCN: V_AND_B32_e32 $vgpr4, $vgpr0, +# GCN: V_AND_B32_e32 $vgpr37, $vgpr0, --- name: csr tracksRegLiveness: true diff --git a/llvm/test/CodeGen/AMDGPU/sibling-call.ll b/llvm/test/CodeGen/AMDGPU/sibling-call.ll --- a/llvm/test/CodeGen/AMDGPU/sibling-call.ll +++ b/llvm/test/CodeGen/AMDGPU/sibling-call.ll @@ -152,9 +152,6 @@ ; FIXME: Why load and store same location for stack args? ; GCN-LABEL: {{^}}sibling_call_i32_fastcc_i32_i32_a32i32: -; GCN-DAG: buffer_store_dword v32, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; GCN-DAG: buffer_store_dword v33, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill - ; GCN-DAG: buffer_load_dword [[LOAD_0:v[0-9]+]], off, s[0:3], s32{{$}} ; GCN-DAG: buffer_load_dword [[LOAD_1:v[0-9]+]], off, s[0:3], s32 offset:4 @@ -163,9 +160,6 @@ ; GCN-DAG: buffer_store_dword [[LOAD_0]], off, s[0:3], s32{{$}} ; GCN-DAG: buffer_store_dword [[LOAD_1]], off, s[0:3], s32 offset:4 -; GCN-DAG: buffer_load_dword v32, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; GCN-DAG: buffer_load_dword v33, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload - ; GCN-NOT: s32 ; GCN: s_setpc_b64 define fastcc i32 @sibling_call_i32_fastcc_i32_i32_a32i32(i32 %a, i32 %b, [32 x i32] %c) #1 { @@ -176,7 +170,7 @@ ; GCN-LABEL: {{^}}sibling_call_i32_fastcc_i32_i32_a32i32_stack_object: ; GCN-DAG: v_mov_b32_e32 [[NINE:v[0-9]+]], 9 -; GCN: buffer_store_dword [[NINE]], off, s[0:3], s32 offset:40 +; GCN: buffer_store_dword [[NINE]], off, s[0:3], s32 offset:28 ; GCN: s_setpc_b64 define fastcc i32 @sibling_call_i32_fastcc_i32_i32_a32i32_stack_object(i32 %a, i32 %b, [32 x i32] %c) #1 { entry: @@ -203,15 +197,15 @@ ; Have another non-tail in the function ; GCN-LABEL: {{^}}sibling_call_i32_fastcc_i32_i32_other_call: ; GCN: s_or_saveexec_b64 s{{\[[0-9]+:[0-9]+\]}}, -1 -; GCN-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill ; GCN-NEXT: s_mov_b64 exec ; GCN: s_mov_b32 s33, s32 ; GCN-DAG: s_add_u32 s32, s32, 0x400 -; GCN-DAG: buffer_store_dword v32, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill -; GCN-DAG: buffer_store_dword v33, off, s[0:3], s33 ; 4-byte Folded Spill -; GCN-DAG: v_writelane_b32 v34, s34, 0 -; GCN-DAG: v_writelane_b32 v34, s35, 1 +; GCN-DAG: buffer_store_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill +; GCN-DAG: buffer_store_dword v41, off, s[0:3], s33 ; 4-byte Folded Spill +; GCN-DAG: v_writelane_b32 v42, s34, 0 +; GCN-DAG: v_writelane_b32 v42, s35, 1 ; GCN-DAG: s_getpc_b64 s[4:5] ; GCN-DAG: s_add_u32 s4, s4, i32_fastcc_i32_i32@gotpcrel32@lo+4 @@ -220,11 +214,11 @@ ; GCN: s_swappc_b64 -; GCN-DAG: v_readlane_b32 s34, v34, 0 -; GCN-DAG: v_readlane_b32 s35, v34, 1 +; GCN-DAG: v_readlane_b32 s34, v42, 0 +; GCN-DAG: v_readlane_b32 s35, v42, 1 -; GCN: buffer_load_dword v33, off, s[0:3], s33 ; 4-byte Folded Reload -; GCN: buffer_load_dword v32, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload +; GCN: buffer_load_dword v41, off, s[0:3], s33 ; 4-byte Folded Reload +; GCN: buffer_load_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload ; GCN: s_getpc_b64 s[4:5] ; GCN-NEXT: s_add_u32 s4, s4, sibling_call_i32_fastcc_i32_i32@rel32@lo+4 @@ -233,7 +227,7 @@ ; GCN: s_sub_u32 s32, s32, 0x400 ; GCN-NEXT: v_readlane_b32 s33, ; GCN-NEXT: s_or_saveexec_b64 s[6:7], -1 -; GCN-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload ; GCN-NEXT: s_mov_b64 exec, s[6:7] ; GCN-NEXT: s_setpc_b64 s[4:5] define fastcc i32 @sibling_call_i32_fastcc_i32_i32_other_call(i32 %a, i32 %b, i32 %c) #1 { @@ -248,11 +242,11 @@ ; GCN-LABEL: {{^}}sibling_call_stack_objecti32_fastcc_i32_i32_a32i32: ; GCN-NOT: s33 -; GCN: buffer_store_dword v{{[0-9]+}}, off, s[0:3], s32 offset: +; GCN: buffer_load_dword v{{[0-9]+}}, off, s[0:3], s32 offset: ; GCN-NOT: s33 -; GCN: buffer_load_dword v{{[0-9]+}}, off, s[0:3], s32 offset: +; GCN: buffer_store_dword v{{[0-9]+}}, off, s[0:3], s32 offset: ; GCN: s_setpc_b64 s[4:5] define fastcc i32 @sibling_call_stack_objecti32_fastcc_i32_i32_a32i32(i32 %a, i32 %b, [32 x i32] %c) #1 { entry: diff --git a/llvm/test/CodeGen/AMDGPU/spill-csr-frame-ptr-reg-copy.ll b/llvm/test/CodeGen/AMDGPU/spill-csr-frame-ptr-reg-copy.ll --- a/llvm/test/CodeGen/AMDGPU/spill-csr-frame-ptr-reg-copy.ll +++ b/llvm/test/CodeGen/AMDGPU/spill-csr-frame-ptr-reg-copy.ll @@ -2,17 +2,17 @@ ; GCN-LABEL: {{^}}spill_csr_s5_copy: ; GCN: s_or_saveexec_b64 -; GCN-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GCN-NEXT: s_mov_b64 exec -; GCN: v_writelane_b32 v32, s33, 2 +; GCN: v_writelane_b32 v40, s33, 2 ; GCN: s_swappc_b64 ; GCN: v_mov_b32_e32 [[K:v[0-9]+]], 9 ; GCN: buffer_store_dword [[K]], off, s[0:3], s33{{$}} -; GCN: v_readlane_b32 s33, v32, 2 +; GCN: v_readlane_b32 s33, v40, 2 ; GCN: s_or_saveexec_b64 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; GCN: s_mov_b64 exec ; GCN: s_setpc_b64 define void @spill_csr_s5_copy() #0 { diff --git a/llvm/test/CodeGen/AMDGPU/stack-pointer-offset-relative-frameindex.ll b/llvm/test/CodeGen/AMDGPU/stack-pointer-offset-relative-frameindex.ll --- a/llvm/test/CodeGen/AMDGPU/stack-pointer-offset-relative-frameindex.ll +++ b/llvm/test/CodeGen/AMDGPU/stack-pointer-offset-relative-frameindex.ll @@ -29,7 +29,7 @@ ; GCN-NEXT: s_mov_b64 s[0:1], s[36:37] ; GCN-NEXT: s_mov_b64 s[2:3], s[38:39] ; GCN-NEXT: s_mov_b32 s32, 0xc0000 -; GCN-NEXT: v_add_nc_u32_e64 v32, 4, 0x4000 +; GCN-NEXT: v_add_nc_u32_e64 v40, 4, 0x4000 ; GCN-NEXT: ; implicit-def: $vcc_hi ; GCN-NEXT: s_getpc_b64 s[4:5] ; GCN-NEXT: s_add_u32 s4, s4, svm_eval_nodes@rel32@lo+4 @@ -41,8 +41,8 @@ ; GCN-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GCN-NEXT: s_cbranch_execz BB0_2 ; GCN-NEXT: ; %bb.1: ; %if.then4.i -; GCN-NEXT: buffer_load_dword v0, v32, s[36:39], s32 offen -; GCN-NEXT: buffer_load_dword v1, v32, s[36:39], s32 offen offset:4 +; GCN-NEXT: buffer_load_dword v0, v40, s[36:39], s32 offen +; GCN-NEXT: buffer_load_dword v1, v40, s[36:39], s32 offen offset:4 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: v_add_nc_u32_e32 v0, v1, v0 ; GCN-NEXT: v_mul_lo_u32 v0, 0x41c64e6d, v0 diff --git a/llvm/test/CodeGen/AMDGPU/vgpr-tuple-allocation.ll b/llvm/test/CodeGen/AMDGPU/vgpr-tuple-allocation.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/vgpr-tuple-allocation.ll @@ -0,0 +1,170 @@ +; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9 %s +; RUN: llc -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10 %s + +declare void @extern_func() + +define <4 x float> @non_preserved_vgpr_tuple8(<8 x i32> %rsrc, <4 x i32> %samp, float %bias, float %zcompare, float %s, float %t, float %clamp) { +; The vgpr tuple8 operand in image_gather4_c_b_cl instruction needs not be +; preserved across the call and should get 8 scratch registers. + +; GFX9-LABEL: non_preserved_vgpr_tuple8: +; GFX9: buffer_store_dword v44, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; GFX9: buffer_store_dword v40, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s33 ; 4-byte Folded Spill + +; GFX9: v_mov_b32_e32 v37, v11 +; GFX9-NEXT: v_mov_b32_e32 v38, v10 +; GFX9-NEXT: v_mov_b32_e32 v49, v9 +; GFX9-NEXT: v_writelane_b32 v44, s30, 0 +; GFX9-NEXT: v_mov_b32_e32 v36, v16 +; GFX9-NEXT: v_mov_b32_e32 v35, v15 +; GFX9-NEXT: v_mov_b32_e32 v34, v14 +; GFX9-NEXT: v_mov_b32_e32 v33, v13 +; GFX9-NEXT: v_mov_b32_e32 v32, v12 + +; GFX9: ;;#ASMSTART +; GFX9-NEXT: ;;#ASMEND + +; GFX9: image_gather4_c_b_cl v[40:43], v[32:39], s[4:11], s[12:15] dmask:0x1 +; GFX9-NEXT: s_getpc_b64 s[4:5] +; GFX9-NEXT: s_add_u32 s4, s4, extern_func@gotpcrel32@lo+4 +; GFX9-NEXT: s_addc_u32 s5, s5, extern_func@gotpcrel32@hi+4 +; GFX9-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] + +; GFX9: buffer_load_dword v43, off, s[0:3], s33 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:12 ; 4-byte Folded Reload +; GFX9: buffer_load_dword v44, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; GFX9: s_setpc_b64 s[4:5] +; +; GFX10-LABEL: non_preserved_vgpr_tuple8: +; GFX10: buffer_store_dword v44, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; GFX10: buffer_store_dword v40, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v43, off, s[0:3], s33 ; 4-byte Folded Spill + +; GFX10: v_mov_b32_e32 v36, v16 +; GFX10-NEXT: v_mov_b32_e32 v35, v15 +; GFX10-NEXT: v_mov_b32_e32 v34, v14 +; GFX10-NEXT: v_mov_b32_e32 v33, v13 +; GFX10-NEXT: v_mov_b32_e32 v32, v12 + +; GFX10: ;;#ASMSTART +; GFX10-NEXT: ;;#ASMEND + +; GFX10: image_gather4_c_b_cl v[40:43], v[32:39], s[4:11], s[12:15] dmask:0x1 +; GFX10-NEXT: v_nop +; GFX10-NEXT: s_getpc_b64 s[4:5] +; GFX10-NEXT: s_add_u32 s4, s4, extern_func@gotpcrel32@lo+4 +; GFX10-NEXT: s_addc_u32 s5, s5, extern_func@gotpcrel32@hi+4 +; GFX10-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: s_swappc_b64 s[30:31], s[4:5] + +; GFX10: buffer_load_dword v43, off, s[0:3], s33 ; 4-byte Folded Reload +; GFX10-NEXT: buffer_load_dword v42, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload +; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:12 ; 4-byte Folded Reload + +; GFX10: buffer_load_dword v44, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; GFX10: s_setpc_b64 s[4:5] +main_body: + call void asm sideeffect "", "~{v0},~{v1},~{v2},~{v3},~{v4},~{v5},~{v6},~{v7}"() #0 + call void asm sideeffect "", "~{v8},~{v9},~{v10},~{v11},~{v12},~{v13},~{v14},~{v15}"() #0 + call void asm sideeffect "", "~{v16},~{v17},~{v18},~{v19},~{v20},~{v21},~{v22},~{v23}"() #0 + call void asm sideeffect "", "~{v24},~{v25},~{v26},~{v27},~{v28},~{v29},~{v30},~{v31}"() #0 + %v = call <4 x float> @llvm.amdgcn.image.gather4.c.b.cl.2d.v4f32.f32.f32(i32 1, float %bias, float %zcompare, float %s, float %t, float %clamp, <8 x i32> %rsrc, <4 x i32> %samp, i1 false, i32 0, i32 0) + call void @extern_func() + ret <4 x float> %v +} + +define <4 x float> @call_preserved_vgpr_tuple8(<8 x i32> %rsrc, <4 x i32> %samp, float %bias, float %zcompare, float %s, float %t, float %clamp) { +; The vgpr tuple8 operand in image_gather4_c_b_cl instruction needs to be preserved +; across the call and should get allcoated to 8 CSRs. +; Only the lower 5 sub-registers of the tuple are preserved. +; The upper 3 sub-registers are unused. + +; GFX9-LABEL: call_preserved_vgpr_tuple8: +; GFX9: buffer_store_dword v56, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; GFX9: buffer_store_dword v40, off, s[0:3], s33 offset:16 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s33 ; 4-byte Folded Spill + +; GFX9: v_mov_b32_e32 v44, v16 +; GFX9-NEXT: v_mov_b32_e32 v43, v15 +; GFX9-NEXT: v_mov_b32_e32 v42, v14 +; GFX9-NEXT: v_mov_b32_e32 v41, v13 +; GFX9-NEXT: v_mov_b32_e32 v40, v12 + +; GFX9: s_getpc_b64 s[4:5] +; GFX9-NEXT: s_add_u32 s4, s4, extern_func@gotpcrel32@lo+4 +; GFX9-NEXT: s_addc_u32 s5, s5, extern_func@gotpcrel32@hi+4 +; GFX9-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; GFX9-NEXT: image_gather4_c_b_cl v[0:3], v[40:47], s[36:43], s[44:47] dmask:0x1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: global_store_dwordx4 v[0:1], v[0:3], off +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GFX9-NEXT: image_gather4_c_b_cl v[0:3], v[40:47], s[36:43], s[44:47] dmask:0x1 + +; GFX9: buffer_load_dword v44, off, s[0:3], s33 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:12 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:16 ; 4-byte Folded Reload + +; GFX9: buffer_load_dword v56, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; GFX9: s_setpc_b64 s[4:5] +; +; GFX10-LABEL: call_preserved_vgpr_tuple8: +; GFX10: buffer_store_dword v45, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; GFX10: buffer_store_dword v40, off, s[0:3], s33 offset:16 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v43, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v44, off, s[0:3], s33 ; 4-byte Folded Spill + +; GFX10: s_getpc_b64 s[4:5] +; GFX10-NEXT: s_add_u32 s4, s4, extern_func@gotpcrel32@lo+4 +; GFX10-NEXT: s_addc_u32 s5, s5, extern_func@gotpcrel32@hi+4 +; GFX10-NEXT: v_mov_b32_e32 v40, v16 +; GFX10-NEXT: v_mov_b32_e32 v41, v15 +; GFX10-NEXT: image_gather4_c_b_cl v[0:3], v[12:19], s[36:43], s[44:47] dmask:0x1 +; GFX10-NEXT: v_mov_b32_e32 v42, v14 +; GFX10-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; GFX10-NEXT: v_mov_b32_e32 v43, v13 +; GFX10-NEXT: v_mov_b32_e32 v44, v12 +; GFX10-NEXT: ; implicit-def: $vcc_hi +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: global_store_dwordx4 v[0:1], v[0:3], off +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GFX10-NEXT: image_gather4_c_b_cl v[0:3], [v44, v43, v42, v41, v40], s[36:43], s[44:47] dmask:0x1 + +; GFX10: buffer_load_dword v44, off, s[0:3], s33 ; 4-byte Folded Reload +; GFX10-NEXT: buffer_load_dword v43, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload +; GFX10-NEXT: buffer_load_dword v42, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload +; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:12 ; 4-byte Folded Reload +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:16 ; 4-byte Folded Reload +; GFX10: buffer_load_dword v45, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; GFX10: s_setpc_b64 s[4:5] +main_body: + %v = call <4 x float> @llvm.amdgcn.image.gather4.c.b.cl.2d.v4f32.f32.f32(i32 1, float %bias, float %zcompare, float %s, float %t, float %clamp, <8 x i32> %rsrc, <4 x i32> %samp, i1 false, i32 0, i32 0) + store <4 x float> %v, <4 x float> addrspace(1)* undef + call void @extern_func() + %v1 = call <4 x float> @llvm.amdgcn.image.gather4.c.b.cl.2d.v4f32.f32.f32(i32 1, float %bias, float %zcompare, float %s, float %t, float %clamp, <8 x i32> %rsrc, <4 x i32> %samp, i1 false, i32 0, i32 0) + ret <4 x float> %v1 +} + +declare <4 x float> @llvm.amdgcn.image.gather4.c.b.cl.2d.v4f32.f32.f32(i32 immarg, float, float, float, float, float, <8 x i32>, <4 x i32>, i1 immarg, i32 immarg, i32 immarg) #1 + +attributes #0 = { nounwind writeonly } +attributes #1 = { nounwind readonly } diff --git a/llvm/test/CodeGen/AMDGPU/virtregrewrite-undef-identity-copy.mir b/llvm/test/CodeGen/AMDGPU/virtregrewrite-undef-identity-copy.mir --- a/llvm/test/CodeGen/AMDGPU/virtregrewrite-undef-identity-copy.mir +++ b/llvm/test/CodeGen/AMDGPU/virtregrewrite-undef-identity-copy.mir @@ -29,7 +29,7 @@ body: | bb.0: ; CHECK-LABEL: name: undef_identity_copy - ; CHECK: renamable $vgpr32_vgpr33_vgpr34_vgpr35 = FLAT_LOAD_DWORDX4 undef renamable $vgpr0_vgpr1, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 16, addrspace 1) + ; CHECK: renamable $vgpr40_vgpr41_vgpr42_vgpr43 = FLAT_LOAD_DWORDX4 undef renamable $vgpr0_vgpr1, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 16, addrspace 1) ; CHECK: renamable $sgpr6_sgpr7 = SI_PC_ADD_REL_OFFSET target-flags(amdgpu-rel32-lo) @foo + 4, target-flags(amdgpu-rel32-hi) @foo + 4, implicit-def dead $scc ; CHECK: ADJCALLSTACKUP 0, 0, implicit-def $sgpr32, implicit $sgpr32, implicit $sgpr95 ; CHECK: $sgpr4 = COPY $sgpr95 @@ -38,9 +38,9 @@ ; CHECK: renamable $sgpr6_sgpr7 = SI_PC_ADD_REL_OFFSET target-flags(amdgpu-rel32-lo) @bar + 4, target-flags(amdgpu-rel32-hi) @bar + 4, implicit-def dead $scc ; CHECK: ADJCALLSTACKUP 0, 0, implicit-def $scc, implicit-def $sgpr32, implicit $sgpr32, implicit $sgpr95 ; CHECK: $sgpr4 = COPY $sgpr95 - ; CHECK: $vgpr0 = COPY renamable $vgpr32 - ; CHECK: $vgpr1 = COPY renamable $vgpr33 - ; CHECK: $vgpr2 = COPY renamable $vgpr34 + ; CHECK: $vgpr0 = COPY renamable $vgpr40 + ; CHECK: $vgpr1 = COPY renamable $vgpr41 + ; CHECK: $vgpr2 = COPY renamable $vgpr42 ; CHECK: $vgpr3 = KILL undef renamable $vgpr3 ; CHECK: dead $sgpr30_sgpr31 = SI_CALL killed renamable $sgpr6_sgpr7, @bar, csr_amdgpu_highregs, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4, implicit $vgpr0, implicit killed $vgpr1, implicit killed $vgpr2, implicit killed $vgpr3, implicit-def $vgpr0 ; CHECK: ADJCALLSTACKDOWN 0, 4, implicit-def $scc, implicit-def $sgpr32, implicit $sgpr32, implicit $sgpr95 diff --git a/llvm/test/CodeGen/AMDGPU/wave32.ll b/llvm/test/CodeGen/AMDGPU/wave32.ll --- a/llvm/test/CodeGen/AMDGPU/wave32.ll +++ b/llvm/test/CodeGen/AMDGPU/wave32.ll @@ -1058,30 +1058,30 @@ ; GFX1064-NEXT: s_or_saveexec_b64 [[COPY_EXEC0:s\[[0-9]+:[0-9]+\]]], -1{{$}} ; GFX1032-NEXT: s_or_saveexec_b32 [[COPY_EXEC0:s[0-9]]], -1{{$}} -; GCN-NEXT: buffer_store_dword v32, off, s[0:3], s32 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill ; GCN-NEXT: v_nop ; GFX1064-NEXT: s_mov_b64 exec, [[COPY_EXEC0]] ; GFX1032-NEXT: s_mov_b32 exec_lo, [[COPY_EXEC0]] -; GCN-NEXT: v_writelane_b32 v32, s33, 2 +; GCN-NEXT: v_writelane_b32 v40, s33, 2 ; GCN: s_mov_b32 s33, s32 ; GFX1064: s_add_u32 s32, s32, 0x400 ; GFX1032: s_add_u32 s32, s32, 0x200 -; GCN-DAG: v_writelane_b32 v32, s30, 0 -; GCN-DAG: v_writelane_b32 v32, s31, 1 +; GCN-DAG: v_writelane_b32 v40, s30, 0 +; GCN-DAG: v_writelane_b32 v40, s31, 1 ; GCN: s_swappc_b64 -; GCN-DAG: v_readlane_b32 s4, v32, 0 -; GCN-DAG: v_readlane_b32 s5, v32, 1 +; GCN-DAG: v_readlane_b32 s4, v40, 0 +; GCN-DAG: v_readlane_b32 s5, v40, 1 ; GFX1064: s_sub_u32 s32, s32, 0x400 ; GFX1032: s_sub_u32 s32, s32, 0x200 -; GCN: v_readlane_b32 s33, v32, 2 +; GCN: v_readlane_b32 s33, v40, 2 ; GFX1064: s_or_saveexec_b64 [[COPY_EXEC1:s\[[0-9]+:[0-9]+\]]], -1{{$}} ; GFX1032: s_or_saveexec_b32 [[COPY_EXEC1:s[0-9]]], -1{{$}} -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload ; GCN-NEXT: v_nop ; GFX1064-NEXT: s_mov_b64 exec, [[COPY_EXEC1]] ; GFX1032-NEXT: s_mov_b32 exec_lo, [[COPY_EXEC1]]