diff --git a/llvm/test/CodeGen/AMDGPU/agpr-copy-no-free-registers.ll b/llvm/test/CodeGen/AMDGPU/agpr-copy-no-free-registers.ll --- a/llvm/test/CodeGen/AMDGPU/agpr-copy-no-free-registers.ll +++ b/llvm/test/CodeGen/AMDGPU/agpr-copy-no-free-registers.ll @@ -156,7 +156,7 @@ ; GFX908-NEXT: ;;#ASMEND ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: no_free_vgprs_at_agpr_to_agpr_copy +; GFX90A-LABEL: no_free_vgprs_at_agpr_to_agpr_copy: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v33, v0 @@ -927,88 +927,88 @@ ; GFX908-NEXT: v_mfma_f32_16x16x1f32 a[0:15], v34, v33, a[16:31] ; GFX908-NEXT: s_nop 7 ; GFX908-NEXT: s_nop 1 -; GFX908-NEXT: v_accvgpr_read_b32 v32, a0 ; Reload Reuse -; GFX908-NEXT: v_accvgpr_read_b32 v39, a11 ; Reload Reuse -; GFX908-NEXT: v_accvgpr_read_b32 v38, a12 ; Reload Reuse +; GFX908-NEXT: v_accvgpr_read_b32 v32, a0 ; Reload Reuse +; GFX908-NEXT: v_accvgpr_read_b32 v39, a11 ; Reload Reuse +; GFX908-NEXT: v_accvgpr_read_b32 v38, a12 ; Reload Reuse ; GFX908-NEXT: buffer_store_dword v32, off, s[0:3], s32 ; 4-byte Folded Spill -; GFX908-NEXT: v_accvgpr_read_b32 v32, a1 ; Reload Reuse -; GFX908-NEXT: v_accvgpr_read_b32 v37, a13 ; Reload Reuse -; GFX908-NEXT: v_accvgpr_read_b32 v36, a14 ; Reload Reuse +; GFX908-NEXT: v_accvgpr_read_b32 v32, a1 ; Reload Reuse +; GFX908-NEXT: v_accvgpr_read_b32 v37, a13 ; Reload Reuse +; GFX908-NEXT: v_accvgpr_read_b32 v36, a14 ; Reload Reuse ; GFX908-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; GFX908-NEXT: v_accvgpr_read_b32 v32, a2 ; Reload Reuse -; GFX908-NEXT: v_accvgpr_read_b32 v35, a15 ; Reload Reuse +; GFX908-NEXT: v_accvgpr_read_b32 v32, a2 ; Reload Reuse +; GFX908-NEXT: v_accvgpr_read_b32 v35, a15 ; Reload Reuse ; GFX908-NEXT: s_nop 0 ; GFX908-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; GFX908-NEXT: v_accvgpr_read_b32 v32, a3 ; Reload Reuse +; GFX908-NEXT: v_accvgpr_read_b32 v32, a3 ; 
Reload Reuse ; GFX908-NEXT: s_nop 1 ; GFX908-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; GFX908-NEXT: v_accvgpr_read_b32 v32, a4 ; Reload Reuse +; GFX908-NEXT: v_accvgpr_read_b32 v32, a4 ; Reload Reuse ; GFX908-NEXT: s_nop 1 ; GFX908-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; GFX908-NEXT: v_accvgpr_read_b32 v32, a5 ; Reload Reuse +; GFX908-NEXT: v_accvgpr_read_b32 v32, a5 ; Reload Reuse ; GFX908-NEXT: s_nop 1 ; GFX908-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; GFX908-NEXT: v_accvgpr_read_b32 v32, a6 ; Reload Reuse +; GFX908-NEXT: v_accvgpr_read_b32 v32, a6 ; Reload Reuse ; GFX908-NEXT: s_nop 1 ; GFX908-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; GFX908-NEXT: v_accvgpr_read_b32 v32, a7 ; Reload Reuse +; GFX908-NEXT: v_accvgpr_read_b32 v32, a7 ; Reload Reuse ; GFX908-NEXT: s_nop 1 ; GFX908-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; GFX908-NEXT: v_accvgpr_read_b32 v32, a8 ; Reload Reuse +; GFX908-NEXT: v_accvgpr_read_b32 v32, a8 ; Reload Reuse ; GFX908-NEXT: s_nop 1 ; GFX908-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; GFX908-NEXT: v_accvgpr_read_b32 v32, a9 ; Reload Reuse +; GFX908-NEXT: v_accvgpr_read_b32 v32, a9 ; Reload Reuse ; GFX908-NEXT: s_nop 1 ; GFX908-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; GFX908-NEXT: v_accvgpr_read_b32 v32, a10 ; Reload Reuse +; GFX908-NEXT: v_accvgpr_read_b32 v32, a10 ; Reload Reuse ; GFX908-NEXT: s_nop 1 ; GFX908-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill ; GFX908-NEXT: ;;#ASMSTART -; GFX908-NEXT: ; copy +; GFX908-NEXT: ; copy ; GFX908-NEXT: ;;#ASMEND ; GFX908-NEXT: v_accvgpr_read_b32 v32, a1 ; GFX908-NEXT: s_nop 1 ; GFX908-NEXT: v_accvgpr_write_b32 a32, v32 ; GFX908-NEXT: buffer_load_dword v32, off, s[0:3], s32 ; 4-byte Folded Reload 
; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_accvgpr_write_b32 a0, v32 ; Reload Reuse +; GFX908-NEXT: v_accvgpr_write_b32 a0, v32 ; Reload Reuse ; GFX908-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_accvgpr_write_b32 a1, v32 ; Reload Reuse +; GFX908-NEXT: v_accvgpr_write_b32 a1, v32 ; Reload Reuse ; GFX908-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_accvgpr_write_b32 a2, v32 ; Reload Reuse +; GFX908-NEXT: v_accvgpr_write_b32 a2, v32 ; Reload Reuse ; GFX908-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_accvgpr_write_b32 a3, v32 ; Reload Reuse +; GFX908-NEXT: v_accvgpr_write_b32 a3, v32 ; Reload Reuse ; GFX908-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_accvgpr_write_b32 a4, v32 ; Reload Reuse +; GFX908-NEXT: v_accvgpr_write_b32 a4, v32 ; Reload Reuse ; GFX908-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_accvgpr_write_b32 a5, v32 ; Reload Reuse +; GFX908-NEXT: v_accvgpr_write_b32 a5, v32 ; Reload Reuse ; GFX908-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_accvgpr_write_b32 a6, v32 ; Reload Reuse +; GFX908-NEXT: v_accvgpr_write_b32 a6, v32 ; Reload Reuse ; GFX908-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_accvgpr_write_b32 a7, v32 ; Reload Reuse +; GFX908-NEXT: v_accvgpr_write_b32 a7, v32 ; Reload Reuse ; GFX908-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_accvgpr_write_b32 a8, v32 ; 
Reload Reuse +; GFX908-NEXT: v_accvgpr_write_b32 a8, v32 ; Reload Reuse ; GFX908-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_accvgpr_write_b32 a9, v32 ; Reload Reuse +; GFX908-NEXT: v_accvgpr_write_b32 a9, v32 ; Reload Reuse ; GFX908-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_accvgpr_write_b32 a10, v32 ; Reload Reuse -; GFX908-NEXT: v_accvgpr_write_b32 a11, v39 ; Reload Reuse -; GFX908-NEXT: v_accvgpr_write_b32 a12, v38 ; Reload Reuse -; GFX908-NEXT: v_accvgpr_write_b32 a13, v37 ; Reload Reuse -; GFX908-NEXT: v_accvgpr_write_b32 a14, v36 ; Reload Reuse -; GFX908-NEXT: v_accvgpr_write_b32 a15, v35 ; Reload Reuse +; GFX908-NEXT: v_accvgpr_write_b32 a10, v32 ; Reload Reuse +; GFX908-NEXT: v_accvgpr_write_b32 a11, v39 ; Reload Reuse +; GFX908-NEXT: v_accvgpr_write_b32 a12, v38 ; Reload Reuse +; GFX908-NEXT: v_accvgpr_write_b32 a13, v37 ; Reload Reuse +; GFX908-NEXT: v_accvgpr_write_b32 a14, v36 ; Reload Reuse +; GFX908-NEXT: v_accvgpr_write_b32 a15, v35 ; Reload Reuse ; GFX908-NEXT: ;;#ASMSTART -; GFX908-NEXT: ; copy +; GFX908-NEXT: ; copy ; GFX908-NEXT: ;;#ASMEND ; GFX908-NEXT: v_accvgpr_read_b32 v33, a2 ; GFX908-NEXT: s_nop 1 @@ -1042,7 +1042,7 @@ ; GFX90A-NEXT: v_accvgpr_write_b32 a18, s2 ; GFX90A-NEXT: v_accvgpr_write_b32 a17, s1 ; GFX90A-NEXT: v_accvgpr_write_b32 a16, s0 -; GFX90A-NEXT: v_accvgpr_read_b32 v34, a32 ; Reload Reuse +; GFX90A-NEXT: v_accvgpr_read_b32 v34, a32 ; Reload Reuse ; GFX90A-NEXT: s_nop 0 ; GFX90A-NEXT: v_mfma_f32_16x16x1f32 a[0:15], v33, v32, a[16:31] ; GFX90A-NEXT: s_nop 7 @@ -1059,16 +1059,16 @@ ; GFX90A-NEXT: buffer_store_dword a8, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill ; GFX90A-NEXT: buffer_store_dword a9, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill ; GFX90A-NEXT: buffer_store_dword a10, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; GFX90A-NEXT: 
v_accvgpr_read_b32 v39, a11 ; Reload Reuse -; GFX90A-NEXT: v_accvgpr_read_b32 v38, a12 ; Reload Reuse -; GFX90A-NEXT: v_accvgpr_read_b32 v37, a13 ; Reload Reuse -; GFX90A-NEXT: v_accvgpr_read_b32 v36, a14 ; Reload Reuse -; GFX90A-NEXT: v_accvgpr_read_b32 v35, a15 ; Reload Reuse +; GFX90A-NEXT: v_accvgpr_read_b32 v39, a11 ; Reload Reuse +; GFX90A-NEXT: v_accvgpr_read_b32 v38, a12 ; Reload Reuse +; GFX90A-NEXT: v_accvgpr_read_b32 v37, a13 ; Reload Reuse +; GFX90A-NEXT: v_accvgpr_read_b32 v36, a14 ; Reload Reuse +; GFX90A-NEXT: v_accvgpr_read_b32 v35, a15 ; Reload Reuse ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; copy +; GFX90A-NEXT: ; copy ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_accvgpr_mov_b32 a32, a1 -; GFX90A-NEXT: buffer_load_dword a0, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX90A-NEXT: buffer_load_dword a0, off, s[0:3], s32 ; 4-byte Folded Reload ; GFX90A-NEXT: s_nop 0 ; GFX90A-NEXT: buffer_load_dword a1, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; GFX90A-NEXT: s_nop 0 @@ -1090,19 +1090,19 @@ ; GFX90A-NEXT: s_nop 0 ; GFX90A-NEXT: buffer_load_dword a10, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_accvgpr_write_b32 a11, v39 ; Reload Reuse -; GFX90A-NEXT: v_accvgpr_write_b32 a12, v38 ; Reload Reuse -; GFX90A-NEXT: v_accvgpr_write_b32 a13, v37 ; Reload Reuse -; GFX90A-NEXT: v_accvgpr_write_b32 a14, v36 ; Reload Reuse -; GFX90A-NEXT: v_accvgpr_write_b32 a15, v35 ; Reload Reuse +; GFX90A-NEXT: v_accvgpr_write_b32 a11, v39 ; Reload Reuse +; GFX90A-NEXT: v_accvgpr_write_b32 a12, v38 ; Reload Reuse +; GFX90A-NEXT: v_accvgpr_write_b32 a13, v37 ; Reload Reuse +; GFX90A-NEXT: v_accvgpr_write_b32 a14, v36 ; Reload Reuse +; GFX90A-NEXT: v_accvgpr_write_b32 a15, v35 ; Reload Reuse ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; copy +; GFX90A-NEXT: ; copy ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_accvgpr_mov_b32 a3, a2 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use a3 v[0:31] ; GFX90A-NEXT: ;;#ASMEND -; 
GFX90A-NEXT: v_accvgpr_write_b32 a32, v34 ; Reload Reuse +; GFX90A-NEXT: v_accvgpr_write_b32 a32, v34 ; Reload Reuse ; GFX90A-NEXT: s_setpc_b64 s[30:31] %asm = call { <32 x i32>, <16 x float> } asm sideeffect "; def $0 $1","=${v[0:31]},=${s[0:15]}"() %vgpr0 = extractvalue { <32 x i32>, <16 x float> } %asm, 0 diff --git a/llvm/test/CodeGen/AMDGPU/agpr-usage-should-fail-on-gfx900.ll b/llvm/test/CodeGen/AMDGPU/agpr-usage-should-fail-on-gfx900.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/agpr-usage-should-fail-on-gfx900.ll @@ -0,0 +1,86 @@ +; RUN: not llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s 2>&1 | FileCheck -check-prefixes=GFX900 %s + +; AGPRs are not available on GFX900, so they are all reserved. Compiling a program for gfx900 +; that uses AGPRs should therefore fail. + +; GFX900: couldn't allocate input reg for constraint 'a' + +; GFX900-LABEL: {{^}}max_10_vgprs_used_9a: +; GFX900: NumVgprs: 10 +; GFX900: ScratchSize: 12 +; GFX900: VGPRBlocks: 2 +; GFX900: NumVGPRsForWavesPerEU: 10 +define amdgpu_kernel void @max_10_vgprs_used_9a(i32 addrspace(1)* %p) #0 { + %tid = load volatile i32, i32 addrspace(1)* undef + call void asm sideeffect "", "a,a,a,a,a,a,a,a,a"(i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9) + %p1 = getelementptr inbounds i32, i32 addrspace(1)* %p, i32 %tid + %p2 = getelementptr inbounds i32, i32 addrspace(1)* %p1, i32 4 + %p3 = getelementptr inbounds i32, i32 addrspace(1)* %p2, i32 8 + %p4 = getelementptr inbounds i32, i32 addrspace(1)* %p3, i32 12 + %p5 = getelementptr inbounds i32, i32 addrspace(1)* %p4, i32 16 + %p6 = getelementptr inbounds i32, i32 addrspace(1)* %p5, i32 20 + %p7 = getelementptr inbounds i32, i32 addrspace(1)* %p6, i32 24 + %p8 = getelementptr inbounds i32, i32 addrspace(1)* %p7, i32 28 + %p9 = getelementptr inbounds i32, i32 addrspace(1)* %p8, i32 32 + %p10 = getelementptr inbounds i32, i32 addrspace(1)* %p9, i32 36 + %v1 = load volatile i32, i32 addrspace(1)* %p1 +
%v2 = load volatile i32, i32 addrspace(1)* %p2 + %v3 = load volatile i32, i32 addrspace(1)* %p3 + %v4 = load volatile i32, i32 addrspace(1)* %p4 + %v5 = load volatile i32, i32 addrspace(1)* %p5 + %v6 = load volatile i32, i32 addrspace(1)* %p6 + %v7 = load volatile i32, i32 addrspace(1)* %p7 + %v8 = load volatile i32, i32 addrspace(1)* %p8 + %v9 = load volatile i32, i32 addrspace(1)* %p9 + %v10 = load volatile i32, i32 addrspace(1)* %p10 + call void asm sideeffect "", "v,v,v,v,v,v,v,v,v,v"(i32 %v1, i32 %v2, i32 %v3, i32 %v4, i32 %v5, i32 %v6, i32 %v7, i32 %v8, i32 %v9, i32 %v10) + store volatile i32 %v1, i32 addrspace(1)* undef + store volatile i32 %v2, i32 addrspace(1)* undef + store volatile i32 %v3, i32 addrspace(1)* undef + store volatile i32 %v4, i32 addrspace(1)* undef + store volatile i32 %v5, i32 addrspace(1)* undef + store volatile i32 %v6, i32 addrspace(1)* undef + store volatile i32 %v7, i32 addrspace(1)* undef + store volatile i32 %v8, i32 addrspace(1)* undef + store volatile i32 %v9, i32 addrspace(1)* undef + store volatile i32 %v10, i32 addrspace(1)* undef + ret void +} + +; GFX900-LABEL: {{^}}max_10_vgprs_used_1a_partial_spill: +; GFX900-DAG: s_mov_b32 s{{[0-9]+}}, SCRATCH_RSRC_DWORD0 +; GFX900-DAG: s_mov_b32 s{{[0-9]+}}, SCRATCH_RSRC_DWORD1 +; GFX900-DAG: buffer_store_dword v{{[0-9]}}, +; GFX900-DAG: buffer_store_dword v{{[0-9]}}, +; GFX900-DAG: buffer_load_dword v{{[0-9]}}, +; GFX900-DAG: buffer_load_dword v{{[0-9]}}, + +; GFX900: NumVgprs: 10 +; GFX900: ScratchSize: 44 +; GFX900: VGPRBlocks: 2 +; GFX900: NumVGPRsForWavesPerEU: 10 +define amdgpu_kernel void @max_10_vgprs_used_1a_partial_spill(i64 addrspace(1)* %p) #0 { + %tid = load volatile i32, i32 addrspace(1)* undef + call void asm sideeffect "", "a"(i32 1) + %p1 = getelementptr inbounds i64, i64 addrspace(1)* %p, i32 %tid + %p2 = getelementptr inbounds i64, i64 addrspace(1)* %p1, i32 8 + %p3 = getelementptr inbounds i64, i64 addrspace(1)* %p2, i32 16 + %p4 = getelementptr inbounds i64, i64 
addrspace(1)* %p3, i32 24 + %p5 = getelementptr inbounds i64, i64 addrspace(1)* %p4, i32 32 + %v1 = load volatile i64, i64 addrspace(1)* %p1 + %v2 = load volatile i64, i64 addrspace(1)* %p2 + %v3 = load volatile i64, i64 addrspace(1)* %p3 + %v4 = load volatile i64, i64 addrspace(1)* %p4 + %v5 = load volatile i64, i64 addrspace(1)* %p5 + call void asm sideeffect "", "v,v,v,v,v"(i64 %v1, i64 %v2, i64 %v3, i64 %v4, i64 %v5) + store volatile i64 %v1, i64 addrspace(1)* %p2 + store volatile i64 %v2, i64 addrspace(1)* %p3 + store volatile i64 %v3, i64 addrspace(1)* %p4 + store volatile i64 %v4, i64 addrspace(1)* %p5 + store volatile i64 %v5, i64 addrspace(1)* %p1 + ret void +} + +declare i32 @llvm.amdgcn.workitem.id.x() + +attributes #0 = { nounwind "amdgpu-num-vgpr"="10" } diff --git a/llvm/test/CodeGen/AMDGPU/spill-agpr.ll b/llvm/test/CodeGen/AMDGPU/spill-agpr-gfx908.ll rename from llvm/test/CodeGen/AMDGPU/spill-agpr.ll rename to llvm/test/CodeGen/AMDGPU/spill-agpr-gfx908.ll --- a/llvm/test/CodeGen/AMDGPU/spill-agpr.ll +++ b/llvm/test/CodeGen/AMDGPU/spill-agpr-gfx908.ll @@ -1,14 +1,13 @@ -; RUN: llc -march=amdgcn -mcpu=gfx908 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX908 %s -; RUN: llc -march=amdgcn -mcpu=gfx90a -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX90A %s +; RUN: llc -march=amdgcn -mcpu=gfx908 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX908 %s -; GCN-LABEL: {{^}}max_12regs_13a_used: -; GCN-NOT: s_mov_b32 s{{[0-9]+}}, SCRATCH_RSRC_DWORD0 -; GCN-NOT: s_mov_b32 s{{[0-9]+}}, SCRATCH_RSRC_DWORD1 -; GCN: v_accvgpr_read_b32 v[[VSPILL:[0-9]+]], a{{[0-9]+}} -; GCN-NOT: buffer_store_dword -; GCN-NOT: buffer_load_dword -; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v[[VSPILL]] -; GCN: ScratchSize: 0 +; GFX908-LABEL: {{^}}max_12regs_13a_used: +; GFX908-NOT: s_mov_b32 s{{[0-9]+}}, SCRATCH_RSRC_DWORD0 +; GFX908-NOT: s_mov_b32 s{{[0-9]+}}, SCRATCH_RSRC_DWORD1 +; GFX908: 
v_accvgpr_read_b32 v[[VSPILL:[0-9]+]], a{{[0-9]+}} +; GFX908-NOT: buffer_store_dword +; GFX908-NOT: buffer_load_dword +; GFX908: v_accvgpr_write_b32 a{{[0-9]+}}, v[[VSPILL]] +; GFX908: ScratchSize: 0 define amdgpu_kernel void @max_12regs_13a_used(i32 %cond, <4 x float> addrspace(1)* %arg, <4 x float> addrspace(1)* %out) #2 { bb: %in.1 = load <4 x float>, <4 x float> addrspace(1)* %arg @@ -29,14 +28,14 @@ ret void } -; GCN-LABEL: {{^}}max_10_vgprs_used_9a: -; GCN-NOT: s_mov_b32 s{{[0-9]+}}, SCRATCH_RSRC_DWORD0 -; GCN-NOT: s_mov_b32 s{{[0-9]+}}, SCRATCH_RSRC_DWORD1 -; GCN: v_accvgpr_read_b32 v[[VSPILL:[0-9]+]], a{{[0-9]+}} -; GCN-NOT: buffer_store_dword -; GCN-NOT: buffer_load_dword -; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v[[VSPILL]] -; GCN: ScratchSize: 0 +; GFX908-LABEL: {{^}}max_10_vgprs_used_9a: +; GFX908-NOT: s_mov_b32 s{{[0-9]+}}, SCRATCH_RSRC_DWORD0 +; GFX908-NOT: s_mov_b32 s{{[0-9]+}}, SCRATCH_RSRC_DWORD1 +; GFX908: v_accvgpr_read_b32 v[[VSPILL:[0-9]+]], a{{[0-9]+}} +; GFX908-NOT: buffer_store_dword +; GFX908-NOT: buffer_load_dword +; GFX908: v_accvgpr_write_b32 a{{[0-9]+}}, v[[VSPILL]] +; GFX908: ScratchSize: 0 define amdgpu_kernel void @max_10_vgprs_used_9a() #1 { %a1 = call <4 x i32> asm sideeffect "", "=a"() %a2 = call <4 x i32> asm sideeffect "", "=a"() @@ -47,15 +46,15 @@ ret void } -; GCN-LABEL: {{^}}max_32regs_mfma32: -; GCN-NOT: s_mov_b32 s{{[0-9]+}}, SCRATCH_RSRC_DWORD0 -; GCN-NOT: s_mov_b32 s{{[0-9]+}}, SCRATCH_RSRC_DWORD1 -; GCN-NOT: buffer_store_dword -; GCN: v_accvgpr_read_b32 -; GCN: v_mfma_f32_32x32x1f32 -; GCN-NOT: buffer_load_dword -; GCN: v_accvgpr_write_b32 -; GCN: ScratchSize: 0 +; GFX908-LABEL: {{^}}max_32regs_mfma32: +; GFX908-NOT: s_mov_b32 s{{[0-9]+}}, SCRATCH_RSRC_DWORD0 +; GFX908-NOT: s_mov_b32 s{{[0-9]+}}, SCRATCH_RSRC_DWORD1 +; GFX908-NOT: buffer_store_dword +; GFX908: v_accvgpr_read_b32 +; GFX908: v_mfma_f32_32x32x1f32 +; GFX908-NOT: buffer_load_dword +; GFX908: v_accvgpr_write_b32 +; GFX908: ScratchSize: 0 define amdgpu_kernel 
void @max_32regs_mfma32(float addrspace(1)* %arg) #3 { bb: %v = call i32 asm sideeffect "", "=a"() @@ -70,9 +69,9 @@ } ; Should spill agprs to memory for both gfx908 and gfx90a. -; GCN-LABEL: {{^}}max_5regs_used_8a: -; GCN: s_mov_b32 s{{[0-9]+}}, SCRATCH_RSRC_DWORD0 -; GCN: s_mov_b32 s{{[0-9]+}}, SCRATCH_RSRC_DWORD1 +; GFX908-LABEL: {{^}}max_5regs_used_8a: +; GFX908: s_mov_b32 s{{[0-9]+}}, SCRATCH_RSRC_DWORD0 +; GFX908: s_mov_b32 s{{[0-9]+}}, SCRATCH_RSRC_DWORD1 ; GFX908-DAG: v_accvgpr_read_b32 v32, a0 ; Reload Reuse ; GFX908-DAG: buffer_store_dword v32, off, s[{{[0-9:]+}}], 0 offset:4 ; 4-byte Folded Spill @@ -83,12 +82,7 @@ ; GFX908-DAG: v_accvgpr_read_b32 v32, a3 ; Reload Reuse ; GFX908-DAG: buffer_store_dword v32, off, s[{{[0-9:]+}}], 0 offset:16 ; 4-byte Folded Spill -; GFX90A-DAG: buffer_store_dword a0, off, s[{{[0-9:]+}}], 0 offset:4 ; 4-byte Folded Spill -; GFX90A-DAG: buffer_store_dword a1, off, s[{{[0-9:]+}}], 0 offset:8 ; 4-byte Folded Spill -; GFX90A-DAG: v_accvgpr_read_b32 v4, a2 ; Reload Reuse -; GFX90A-DAG: v_accvgpr_read_b32 v3, a3 ; Reload Reuse - -; GCN: v_mfma_f32_4x4x1f32 a[0:3], v{{[0-9]+}}, v{{[0-9]+}}, a[0:3] +; GFX908: v_mfma_f32_4x4x1f32 a[0:3], v{{[0-9]+}}, v{{[0-9]+}}, a[0:3] ; GFX908-DAG: buffer_load_dword v0, off, s[{{[0-9:]+}}], 0 offset:4 ; 4-byte Folded Reload ; GFX908-DAG: buffer_load_dword v1, off, s[{{[0-9:]+}}], 0 offset:8 ; 4-byte Folded Reload @@ -96,13 +90,7 @@ ; GFX908-DAG: buffer_load_dword v3, off, s[{{[0-9:]+}}], 0 offset:16 ; 4-byte Folded Reload ; GFX908: global_store_dwordx4 v[{{[0-9:]+}}], v[0:3], off -; GFX90A-DAG: buffer_load_dword a0, off, s[4:7], 0 offset:4 ; 4-byte Folded Reload -; GFX90A-DAG: buffer_load_dword a1, off, s[4:7], 0 offset:8 ; 4-byte Folded Reload -; GFX90A-DAG: v_accvgpr_write_b32 a2, v4 ; Reload Reuse -; GFX90A-DAG: v_accvgpr_write_b32 a3, v3 ; Reload Reuse -; GFX90A: global_store_dwordx4 v[0:1], a[0:3], off - -; GCN: ScratchSize: 20 +; GFX908: ScratchSize: 20 define amdgpu_kernel void 
@max_5regs_used_8a(<4 x float> addrspace(1)* %arg) #4 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %v0 = call float asm sideeffect "; def $0", "=v"() diff --git a/llvm/test/CodeGen/AMDGPU/spill-agpr.ll b/llvm/test/CodeGen/AMDGPU/spill-agpr-gfx90a.ll rename from llvm/test/CodeGen/AMDGPU/spill-agpr.ll rename to llvm/test/CodeGen/AMDGPU/spill-agpr-gfx90a.ll --- a/llvm/test/CodeGen/AMDGPU/spill-agpr.ll +++ b/llvm/test/CodeGen/AMDGPU/spill-agpr-gfx90a.ll @@ -1,14 +1,13 @@ -; RUN: llc -march=amdgcn -mcpu=gfx908 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX908 %s -; RUN: llc -march=amdgcn -mcpu=gfx90a -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX90A %s +; RUN: llc -march=amdgcn -mcpu=gfx90a -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX90A %s -; GCN-LABEL: {{^}}max_12regs_13a_used: -; GCN-NOT: s_mov_b32 s{{[0-9]+}}, SCRATCH_RSRC_DWORD0 -; GCN-NOT: s_mov_b32 s{{[0-9]+}}, SCRATCH_RSRC_DWORD1 -; GCN: v_accvgpr_read_b32 v[[VSPILL:[0-9]+]], a{{[0-9]+}} -; GCN-NOT: buffer_store_dword -; GCN-NOT: buffer_load_dword -; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v[[VSPILL]] -; GCN: ScratchSize: 0 +; GFX90A-LABEL: {{^}}max_12regs_13a_used: +; GFX90A-NOT: s_mov_b32 s{{[0-9]+}}, SCRATCH_RSRC_DWORD0 +; GFX90A-NOT: s_mov_b32 s{{[0-9]+}}, SCRATCH_RSRC_DWORD1 +; GFX90A: v_accvgpr_read_b32 v[[VSPILL:[0-9]+]], a{{[0-9]+}} +; GFX90A-NOT: buffer_store_dword +; GFX90A-NOT: buffer_load_dword +; GFX90A: v_accvgpr_write_b32 a{{[0-9]+}}, v[[VSPILL]] +; GFX90A: ScratchSize: 0 define amdgpu_kernel void @max_12regs_13a_used(i32 %cond, <4 x float> addrspace(1)* %arg, <4 x float> addrspace(1)* %out) #2 { bb: %in.1 = load <4 x float>, <4 x float> addrspace(1)* %arg @@ -29,14 +28,14 @@ ret void } -; GCN-LABEL: {{^}}max_10_vgprs_used_9a: -; GCN-NOT: s_mov_b32 s{{[0-9]+}}, SCRATCH_RSRC_DWORD0 -; GCN-NOT: s_mov_b32 s{{[0-9]+}}, SCRATCH_RSRC_DWORD1 -; GCN: v_accvgpr_read_b32 v[[VSPILL:[0-9]+]], a{{[0-9]+}} -; 
GCN-NOT: buffer_store_dword -; GCN-NOT: buffer_load_dword -; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v[[VSPILL]] -; GCN: ScratchSize: 0 +; GFX90A-LABEL: {{^}}max_10_vgprs_used_9a: +; GFX90A-NOT: s_mov_b32 s{{[0-9]+}}, SCRATCH_RSRC_DWORD0 +; GFX90A-NOT: s_mov_b32 s{{[0-9]+}}, SCRATCH_RSRC_DWORD1 +; GFX90A: v_accvgpr_read_b32 v[[VSPILL:[0-9]+]], a{{[0-9]+}} +; GFX90A-NOT: buffer_store_dword +; GFX90A-NOT: buffer_load_dword +; GFX90A: v_accvgpr_write_b32 a{{[0-9]+}}, v[[VSPILL]] +; GFX90A: ScratchSize: 0 define amdgpu_kernel void @max_10_vgprs_used_9a() #1 { %a1 = call <4 x i32> asm sideeffect "", "=a"() %a2 = call <4 x i32> asm sideeffect "", "=a"() @@ -47,15 +46,15 @@ ret void } -; GCN-LABEL: {{^}}max_32regs_mfma32: -; GCN-NOT: s_mov_b32 s{{[0-9]+}}, SCRATCH_RSRC_DWORD0 -; GCN-NOT: s_mov_b32 s{{[0-9]+}}, SCRATCH_RSRC_DWORD1 -; GCN-NOT: buffer_store_dword -; GCN: v_accvgpr_read_b32 -; GCN: v_mfma_f32_32x32x1f32 -; GCN-NOT: buffer_load_dword -; GCN: v_accvgpr_write_b32 -; GCN: ScratchSize: 0 +; GFX90A-LABEL: {{^}}max_32regs_mfma32: +; GFX90A-NOT: s_mov_b32 s{{[0-9]+}}, SCRATCH_RSRC_DWORD0 +; GFX90A-NOT: s_mov_b32 s{{[0-9]+}}, SCRATCH_RSRC_DWORD1 +; GFX90A-NOT: buffer_store_dword +; GFX90A: v_accvgpr_read_b32 +; GFX90A: v_mfma_f32_32x32x1f32 +; GFX90A-NOT: buffer_load_dword +; GFX90A: v_accvgpr_write_b32 +; GFX90A: ScratchSize: 0 define amdgpu_kernel void @max_32regs_mfma32(float addrspace(1)* %arg) #3 { bb: %v = call i32 asm sideeffect "", "=a"() @@ -70,39 +69,21 @@ } ; Should spill agprs to memory for both gfx908 and gfx90a. 
-; GCN-LABEL: {{^}}max_5regs_used_8a: -; GCN: s_mov_b32 s{{[0-9]+}}, SCRATCH_RSRC_DWORD0 -; GCN: s_mov_b32 s{{[0-9]+}}, SCRATCH_RSRC_DWORD1 - -; GFX908-DAG: v_accvgpr_read_b32 v32, a0 ; Reload Reuse -; GFX908-DAG: buffer_store_dword v32, off, s[{{[0-9:]+}}], 0 offset:4 ; 4-byte Folded Spill -; GFX908-DAG: v_accvgpr_read_b32 v32, a1 ; Reload Reuse -; GFX908-DAG: buffer_store_dword v32, off, s[{{[0-9:]+}}], 0 offset:8 ; 4-byte Folded Spill -; GFX908-DAG: v_accvgpr_read_b32 v32, a2 ; Reload Reuse -; GFX908-DAG: buffer_store_dword v32, off, s[{{[0-9:]+}}], 0 offset:12 ; 4-byte Folded Spill -; GFX908-DAG: v_accvgpr_read_b32 v32, a3 ; Reload Reuse -; GFX908-DAG: buffer_store_dword v32, off, s[{{[0-9:]+}}], 0 offset:16 ; 4-byte Folded Spill - +; GFX90A-LABEL: {{^}}max_5regs_used_8a: +; GFX90A: s_mov_b32 s{{[0-9]+}}, SCRATCH_RSRC_DWORD0 +; GFX90A: s_mov_b32 s{{[0-9]+}}, SCRATCH_RSRC_DWORD1 ; GFX90A-DAG: buffer_store_dword a0, off, s[{{[0-9:]+}}], 0 offset:4 ; 4-byte Folded Spill ; GFX90A-DAG: buffer_store_dword a1, off, s[{{[0-9:]+}}], 0 offset:8 ; 4-byte Folded Spill ; GFX90A-DAG: v_accvgpr_read_b32 v4, a2 ; Reload Reuse ; GFX90A-DAG: v_accvgpr_read_b32 v3, a3 ; Reload Reuse - -; GCN: v_mfma_f32_4x4x1f32 a[0:3], v{{[0-9]+}}, v{{[0-9]+}}, a[0:3] - -; GFX908-DAG: buffer_load_dword v0, off, s[{{[0-9:]+}}], 0 offset:4 ; 4-byte Folded Reload -; GFX908-DAG: buffer_load_dword v1, off, s[{{[0-9:]+}}], 0 offset:8 ; 4-byte Folded Reload -; GFX908-DAG: buffer_load_dword v2, off, s[{{[0-9:]+}}], 0 offset:12 ; 4-byte Folded Reload -; GFX908-DAG: buffer_load_dword v3, off, s[{{[0-9:]+}}], 0 offset:16 ; 4-byte Folded Reload -; GFX908: global_store_dwordx4 v[{{[0-9:]+}}], v[0:3], off - +; GFX90A: v_mfma_f32_4x4x1f32 a[0:3], v{{[0-9]+}}, v{{[0-9]+}}, a[0:3] ; GFX90A-DAG: buffer_load_dword a0, off, s[4:7], 0 offset:4 ; 4-byte Folded Reload ; GFX90A-DAG: buffer_load_dword a1, off, s[4:7], 0 offset:8 ; 4-byte Folded Reload ; GFX90A-DAG: v_accvgpr_write_b32 a2, v4 ; Reload Reuse ; GFX90A-DAG: 
v_accvgpr_write_b32 a3, v3 ; Reload Reuse ; GFX90A: global_store_dwordx4 v[0:1], a[0:3], off -; GCN: ScratchSize: 20 +; GFX90A: ScratchSize: 20 define amdgpu_kernel void @max_5regs_used_8a(<4 x float> addrspace(1)* %arg) #4 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %v0 = call float asm sideeffect "; def $0", "=v"() diff --git a/llvm/test/CodeGen/AMDGPU/spill-vgpr-to-agpr.ll b/llvm/test/CodeGen/AMDGPU/spill-vgpr-on-gfx900.ll rename from llvm/test/CodeGen/AMDGPU/spill-vgpr-to-agpr.ll rename to llvm/test/CodeGen/AMDGPU/spill-vgpr-on-gfx900.ll --- a/llvm/test/CodeGen/AMDGPU/spill-vgpr-to-agpr.ll +++ b/llvm/test/CodeGen/AMDGPU/spill-vgpr-on-gfx900.ll @@ -1,27 +1,17 @@ -; RUN: llc -march=amdgcn -mcpu=gfx908 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX908 %s -; RUN: not llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s 2>&1 | FileCheck -check-prefixes=GCN,GFX900 %s +; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s 2>&1 | FileCheck -check-prefixes=GFX900 %s -; GFX900: couldn't allocate input reg for constraint 'a' - - -; GCN-LABEL: {{^}}max_10_vgprs: +; GFX900-LABEL: {{^}}max_10_vgprs: ; GFX900-DAG: s_mov_b32 s{{[0-9]+}}, SCRATCH_RSRC_DWORD0 ; GFX900-DAG: s_mov_b32 s{{[0-9]+}}, SCRATCH_RSRC_DWORD1 -; GFX908-NOT: SCRATCH_RSRC -; GFX908-DAG: v_accvgpr_write_b32 [[A_REG:a[0-9]+]], v{{[0-9]}} ; GFX900: buffer_store_dword v{{[0-9]}}, ; GFX900: buffer_store_dword v{{[0-9]}}, ; GFX900: buffer_load_dword v{{[0-9]}}, ; GFX900: buffer_load_dword v{{[0-9]}}, -; GFX908-NOT: buffer_ -; GFX908-DAG: v_mov_b32_e32 v{{[0-9]}}, [[V_REG:v[0-9]+]] -; GFX908-DAG: v_accvgpr_read_b32 [[V_REG]], [[A_REG]] -; GCN: NumVgprs: 10 +; GFX900: NumVgprs: 10 ; GFX900: ScratchSize: 12 -; GFX908: ScratchSize: 0 -; GCN: VGPRBlocks: 2 -; GCN: NumVGPRsForWavesPerEU: 10 +; GFX900: VGPRBlocks: 2 +; GFX900: NumVGPRsForWavesPerEU: 10 define amdgpu_kernel void @max_10_vgprs(i32 addrspace(1)* %p) #0 { %tid = load volatile i32, i32 addrspace(1)* undef %p1 = getelementptr 
inbounds i32, i32 addrspace(1)* %p, i32 %tid @@ -58,124 +48,11 @@ ret void } -; GCN-LABEL: {{^}}max_10_vgprs_used_9a: -; GFX908-NOT: s_mov_b32 s{{[0-9]+}}, SCRATCH_RSRC_DWORD0 -; GFX908-NOT: s_mov_b32 s{{[0-9]+}}, SCRATCH_RSRC_DWORD1 -; GFX908-DAG: v_accvgpr_write_b32 [[A_REG:a[0-9]+]], v{{[0-9]}} -; GFX908-NOT: buffer_store_dword v{{[0-9]}}, -; GFX908-NOT: buffer_ -; GFX908: v_mov_b32_e32 v{{[0-9]}}, [[V_REG:v[0-9]+]] -; GFX908: v_accvgpr_read_b32 [[V_REG]], [[A_REG]] -; GFX908-NOT: buffer_ - -; GFX908: NumVgprs: 10 -; GFX908: ScratchSize: 0 -; GFX908: VGPRBlocks: 2 -; GFX908: NumVGPRsForWavesPerEU: 10 -define amdgpu_kernel void @max_10_vgprs_used_9a(i32 addrspace(1)* %p) #0 { - %tid = load volatile i32, i32 addrspace(1)* undef - call void asm sideeffect "", "a,a,a,a,a,a,a,a,a"(i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9) - %p1 = getelementptr inbounds i32, i32 addrspace(1)* %p, i32 %tid - %p2 = getelementptr inbounds i32, i32 addrspace(1)* %p1, i32 4 - %p3 = getelementptr inbounds i32, i32 addrspace(1)* %p2, i32 8 - %p4 = getelementptr inbounds i32, i32 addrspace(1)* %p3, i32 12 - %p5 = getelementptr inbounds i32, i32 addrspace(1)* %p4, i32 16 - %p6 = getelementptr inbounds i32, i32 addrspace(1)* %p5, i32 20 - %p7 = getelementptr inbounds i32, i32 addrspace(1)* %p6, i32 24 - %p8 = getelementptr inbounds i32, i32 addrspace(1)* %p7, i32 28 - %p9 = getelementptr inbounds i32, i32 addrspace(1)* %p8, i32 32 - %p10 = getelementptr inbounds i32, i32 addrspace(1)* %p9, i32 36 - %v1 = load volatile i32, i32 addrspace(1)* %p1 - %v2 = load volatile i32, i32 addrspace(1)* %p2 - %v3 = load volatile i32, i32 addrspace(1)* %p3 - %v4 = load volatile i32, i32 addrspace(1)* %p4 - %v5 = load volatile i32, i32 addrspace(1)* %p5 - %v6 = load volatile i32, i32 addrspace(1)* %p6 - %v7 = load volatile i32, i32 addrspace(1)* %p7 - %v8 = load volatile i32, i32 addrspace(1)* %p8 - %v9 = load volatile i32, i32 addrspace(1)* %p9 - %v10 = load volatile i32, i32 addrspace(1)* 
%p10 - call void asm sideeffect "", "v,v,v,v,v,v,v,v,v,v"(i32 %v1, i32 %v2, i32 %v3, i32 %v4, i32 %v5, i32 %v6, i32 %v7, i32 %v8, i32 %v9, i32 %v10) - store volatile i32 %v1, i32 addrspace(1)* undef - store volatile i32 %v2, i32 addrspace(1)* undef - store volatile i32 %v3, i32 addrspace(1)* undef - store volatile i32 %v4, i32 addrspace(1)* undef - store volatile i32 %v5, i32 addrspace(1)* undef - store volatile i32 %v6, i32 addrspace(1)* undef - store volatile i32 %v7, i32 addrspace(1)* undef - store volatile i32 %v8, i32 addrspace(1)* undef - store volatile i32 %v9, i32 addrspace(1)* undef - store volatile i32 %v10, i32 addrspace(1)* undef - ret void -} - -; GCN-LABEL: {{^}}max_10_vgprs_used_1a_partial_spill: -; GCN-DAG: s_mov_b32 s{{[0-9]+}}, SCRATCH_RSRC_DWORD0 -; GCN-DAG: s_mov_b32 s{{[0-9]+}}, SCRATCH_RSRC_DWORD1 -; GFX908-DAG: v_accvgpr_write_b32 a0, 1 -; GCN-DAG: buffer_store_dword v{{[0-9]}}, -; GCN-DAG: buffer_store_dword v{{[0-9]}}, -; GFX908-DAG: v_accvgpr_write_b32 a1, v{{[0-9]}} -; GFX908-DAG: v_accvgpr_write_b32 a2, v{{[0-9]}} -; GFX908-DAG: v_accvgpr_write_b32 a3, v{{[0-9]}} -; GFX908-DAG: v_accvgpr_write_b32 a4, v{{[0-9]}} -; GFX908-DAG: v_accvgpr_write_b32 a5, v{{[0-9]}} -; GFX908-DAG: v_accvgpr_write_b32 a6, v{{[0-9]}} -; GFX908-DAG: v_accvgpr_write_b32 a7, v{{[0-9]}} -; GFX908-DAG: v_accvgpr_write_b32 a8, v{{[0-9]}} -; GFX908-DAG: v_accvgpr_write_b32 a9, v{{[0-9]}} -; GCN-DAG: buffer_load_dword v{{[0-9]}}, -; GCN-DAG: buffer_load_dword v{{[0-9]}}, -; GFX908-DAG: v_accvgpr_read_b32 v{{[0-9]}}, a1 -; GFX908-DAG: v_accvgpr_read_b32 v{{[0-9]}}, a2 -; GFX908-DAG: v_accvgpr_read_b32 v{{[0-9]}}, a3 -; GFX908-DAG: v_accvgpr_read_b32 v{{[0-9]}}, a4 -; GFX908-DAG: v_accvgpr_read_b32 v{{[0-9]}}, a5 -; GFX908-DAG: v_accvgpr_read_b32 v{{[0-9]}}, a6 -; GFX908-DAG: v_accvgpr_read_b32 v{{[0-9]}}, a7 -; GFX908-DAG: v_accvgpr_read_b32 v{{[0-9]}}, a8 -; GFX908-DAG: v_accvgpr_read_b32 v{{[0-9]}}, a9 - -; GCN: NumVgprs: 10 -; GFX900: ScratchSize: 44 -; GFX908: 
ScratchSize: 12 -; GCN: VGPRBlocks: 2 -; GCN: NumVGPRsForWavesPerEU: 10 -define amdgpu_kernel void @max_10_vgprs_used_1a_partial_spill(i64 addrspace(1)* %p) #0 { - %tid = load volatile i32, i32 addrspace(1)* undef - call void asm sideeffect "", "a"(i32 1) - %p1 = getelementptr inbounds i64, i64 addrspace(1)* %p, i32 %tid - %p2 = getelementptr inbounds i64, i64 addrspace(1)* %p1, i32 8 - %p3 = getelementptr inbounds i64, i64 addrspace(1)* %p2, i32 16 - %p4 = getelementptr inbounds i64, i64 addrspace(1)* %p3, i32 24 - %p5 = getelementptr inbounds i64, i64 addrspace(1)* %p4, i32 32 - %v1 = load volatile i64, i64 addrspace(1)* %p1 - %v2 = load volatile i64, i64 addrspace(1)* %p2 - %v3 = load volatile i64, i64 addrspace(1)* %p3 - %v4 = load volatile i64, i64 addrspace(1)* %p4 - %v5 = load volatile i64, i64 addrspace(1)* %p5 - call void asm sideeffect "", "v,v,v,v,v"(i64 %v1, i64 %v2, i64 %v3, i64 %v4, i64 %v5) - store volatile i64 %v1, i64 addrspace(1)* %p2 - store volatile i64 %v2, i64 addrspace(1)* %p3 - store volatile i64 %v3, i64 addrspace(1)* %p4 - store volatile i64 %v4, i64 addrspace(1)* %p5 - store volatile i64 %v5, i64 addrspace(1)* %p1 - ret void -} - -; GCN-LABEL: {{^}}max_10_vgprs_spill_v32: -; GCN-DAG: s_mov_b32 s{{[0-9]+}}, SCRATCH_RSRC_DWORD0 -; GCN-DAG: s_mov_b32 s{{[0-9]+}}, SCRATCH_RSRC_DWORD1 -; GCN: buffer_store_dword v{{[0-9]}}, -; GFX908-DAG: v_accvgpr_write_b32 a0, v{{[0-9]}} -; GFX908-DAG: v_accvgpr_write_b32 a9, v{{[0-9]}} -; GCN-NOT: a10 - -; GFX908: NumVgprs: 10 -; GFX900: ScratchSize: 100 -; GFX908: ScratchSize: 68 -; GFX908: VGPRBlocks: 2 -; GFX908: NumVGPRsForWavesPerEU: 10 +; GFX900-LABEL: {{^}}max_10_vgprs_spill_v32: +; GFX900-DAG: s_mov_b32 s{{[0-9]+}}, SCRATCH_RSRC_DWORD0 +; GFX900-DAG: s_mov_b32 s{{[0-9]+}}, SCRATCH_RSRC_DWORD1 +; GFX900: buffer_store_dword v{{[0-9]}}, +; GFX900-NOT: a10 define amdgpu_kernel void @max_10_vgprs_spill_v32(<32 x float> addrspace(1)* %p) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep = 
getelementptr inbounds <32 x float>, <32 x float> addrspace(1)* %p, i32 %tid @@ -184,23 +61,16 @@ ret void } -; GCN-LABEL: {{^}}max_256_vgprs_spill_9x32: +; GFX900-LABEL: {{^}}max_256_vgprs_spill_9x32: ; GFX900-DAG: s_mov_b32 s{{[0-9]+}}, SCRATCH_RSRC_DWORD0 ; GFX900-DAG: s_mov_b32 s{{[0-9]+}}, SCRATCH_RSRC_DWORD1 -; GFX908-NOT: SCRATCH_RSRC -; GFX908-DAG: v_accvgpr_write_b32 a0, v ; GFX900: buffer_store_dword v ; GFX900: buffer_load_dword v -; GFX908-NOT: buffer_ -; GFX908-DAG: v_accvgpr_read_b32 ; GFX900: NumVgprs: 256 ; GFX900: ScratchSize: 148 -; GFX908: NumVgprs: 255 -; GFX908: ScratchSize: 0 -; GCN: VGPRBlocks: 63 -; GFX900: NumVGPRsForWavesPerEU: 256 -; GFX908: NumVGPRsForWavesPerEU: 255 +; GFX900: VGPRBlocks: 63 +; GFX900: NumVGPRsForWavesPerEU: 256 define amdgpu_kernel void @max_256_vgprs_spill_9x32(<32 x float> addrspace(1)* %p) #1 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %p1 = getelementptr inbounds <32 x float>, <32 x float> addrspace(1)* %p, i32 %tid @@ -233,24 +103,16 @@ ret void } -; GCN-LABEL: {{^}}max_256_vgprs_spill_9x32_2bb: +; GFX900-LABEL: {{^}}max_256_vgprs_spill_9x32_2bb: ; GFX900-DAG: s_mov_b32 s{{[0-9]+}}, SCRATCH_RSRC_DWORD0 ; GFX900-DAG: s_mov_b32 s{{[0-9]+}}, SCRATCH_RSRC_DWORD1 -; GFX908-NOT: SCRATCH_RSRC -; GFX908: v_accvgpr_write_b32 -; GFX908: global_load_ ; GFX900: buffer_store_dword v ; GFX900: buffer_load_dword v -; GFX908-NOT: buffer_ -; GFX908-DAG: v_accvgpr_read_b32 -; GFX900: NumVgprs: 256 -; GFX908: NumVgprs: 253 +; GFX900: NumVgprs: 256 ; GFX900: ScratchSize: 2052 -; GFX908: ScratchSize: 0 -; GCN: VGPRBlocks: 63 -; GFX900: NumVGPRsForWavesPerEU: 256 -; GFX908: NumVGPRsForWavesPerEU: 253 +; GFX900: VGPRBlocks: 63 +; GFX900: NumVGPRsForWavesPerEU: 256 define amdgpu_kernel void @max_256_vgprs_spill_9x32_2bb(<32 x float> addrspace(1)* %p) #1 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %p1 = getelementptr inbounds <32 x float>, <32 x float> addrspace(1)* %p, i32 %tid @@ -288,11 +150,6 @@ ; Make sure there's no crash when 
we have loads from fixed stack ; objects and are processing VGPR spills - -; GCN-LABEL: {{^}}stack_args_vgpr_spill: -; GFX908: v_accvgpr_write_b32 -; GFX908: buffer_load_dword v{{[0-9]+}}, off, s[0:3], s32 -; GFX908: buffer_load_dword v{{[0-9]+}}, off, s[0:3], s32 offset:4 define void @stack_args_vgpr_spill(<32 x float> %arg0, <32 x float> %arg1, <32 x float> addrspace(1)* %p) #1 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %p1 = getelementptr inbounds <32 x float>, <32 x float> addrspace(1)* %p, i32 %tid @@ -324,7 +181,6 @@ ret void } - declare i32 @llvm.amdgcn.workitem.id.x() attributes #0 = { nounwind "amdgpu-num-vgpr"="10" } diff --git a/llvm/test/CodeGen/AMDGPU/spill-vgpr-to-agpr.ll b/llvm/test/CodeGen/AMDGPU/spill-vgpr-to-agpr-gfx908.ll rename from llvm/test/CodeGen/AMDGPU/spill-vgpr-to-agpr.ll rename to llvm/test/CodeGen/AMDGPU/spill-vgpr-to-agpr-gfx908.ll --- a/llvm/test/CodeGen/AMDGPU/spill-vgpr-to-agpr.ll +++ b/llvm/test/CodeGen/AMDGPU/spill-vgpr-to-agpr-gfx908.ll @@ -1,27 +1,16 @@ -; RUN: llc -march=amdgcn -mcpu=gfx908 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX908 %s -; RUN: not llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s 2>&1 | FileCheck -check-prefixes=GCN,GFX900 %s +; RUN: llc -march=amdgcn -mcpu=gfx908 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX908 %s -; GFX900: couldn't allocate input reg for constraint 'a' - - -; GCN-LABEL: {{^}}max_10_vgprs: -; GFX900-DAG: s_mov_b32 s{{[0-9]+}}, SCRATCH_RSRC_DWORD0 -; GFX900-DAG: s_mov_b32 s{{[0-9]+}}, SCRATCH_RSRC_DWORD1 +; GFX908-LABEL: {{^}}max_10_vgprs: ; GFX908-NOT: SCRATCH_RSRC ; GFX908-DAG: v_accvgpr_write_b32 [[A_REG:a[0-9]+]], v{{[0-9]}} -; GFX900: buffer_store_dword v{{[0-9]}}, -; GFX900: buffer_store_dword v{{[0-9]}}, -; GFX900: buffer_load_dword v{{[0-9]}}, -; GFX900: buffer_load_dword v{{[0-9]}}, ; GFX908-NOT: buffer_ ; GFX908-DAG: v_mov_b32_e32 v{{[0-9]}}, [[V_REG:v[0-9]+]] ; GFX908-DAG: v_accvgpr_read_b32 [[V_REG]], [[A_REG]] -; GCN: NumVgprs: 10 -; 
GFX900: ScratchSize: 12 +; GFX908: NumVgprs: 10 ; GFX908: ScratchSize: 0 -; GCN: VGPRBlocks: 2 -; GCN: NumVGPRsForWavesPerEU: 10 +; GFX908: VGPRBlocks: 2 +; GFX908: NumVGPRsForWavesPerEU: 10 define amdgpu_kernel void @max_10_vgprs(i32 addrspace(1)* %p) #0 { %tid = load volatile i32, i32 addrspace(1)* undef %p1 = getelementptr inbounds i32, i32 addrspace(1)* %p, i32 %tid @@ -58,7 +47,7 @@ ret void } -; GCN-LABEL: {{^}}max_10_vgprs_used_9a: +; GFX908-LABEL: {{^}}max_10_vgprs_used_9a: ; GFX908-NOT: s_mov_b32 s{{[0-9]+}}, SCRATCH_RSRC_DWORD0 ; GFX908-NOT: s_mov_b32 s{{[0-9]+}}, SCRATCH_RSRC_DWORD1 ; GFX908-DAG: v_accvgpr_write_b32 [[A_REG:a[0-9]+]], v{{[0-9]}} @@ -109,12 +98,12 @@ ret void } -; GCN-LABEL: {{^}}max_10_vgprs_used_1a_partial_spill: -; GCN-DAG: s_mov_b32 s{{[0-9]+}}, SCRATCH_RSRC_DWORD0 -; GCN-DAG: s_mov_b32 s{{[0-9]+}}, SCRATCH_RSRC_DWORD1 +; GFX908-LABEL: {{^}}max_10_vgprs_used_1a_partial_spill: +; GFX908-DAG: s_mov_b32 s{{[0-9]+}}, SCRATCH_RSRC_DWORD0 +; GFX908-DAG: s_mov_b32 s{{[0-9]+}}, SCRATCH_RSRC_DWORD1 ; GFX908-DAG: v_accvgpr_write_b32 a0, 1 -; GCN-DAG: buffer_store_dword v{{[0-9]}}, -; GCN-DAG: buffer_store_dword v{{[0-9]}}, +; GFX908-DAG: buffer_store_dword v{{[0-9]}}, +; GFX908-DAG: buffer_store_dword v{{[0-9]}}, ; GFX908-DAG: v_accvgpr_write_b32 a1, v{{[0-9]}} ; GFX908-DAG: v_accvgpr_write_b32 a2, v{{[0-9]}} ; GFX908-DAG: v_accvgpr_write_b32 a3, v{{[0-9]}} @@ -124,8 +113,8 @@ ; GFX908-DAG: v_accvgpr_write_b32 a7, v{{[0-9]}} ; GFX908-DAG: v_accvgpr_write_b32 a8, v{{[0-9]}} ; GFX908-DAG: v_accvgpr_write_b32 a9, v{{[0-9]}} -; GCN-DAG: buffer_load_dword v{{[0-9]}}, -; GCN-DAG: buffer_load_dword v{{[0-9]}}, +; GFX908-DAG: buffer_load_dword v{{[0-9]}}, +; GFX908-DAG: buffer_load_dword v{{[0-9]}}, ; GFX908-DAG: v_accvgpr_read_b32 v{{[0-9]}}, a1 ; GFX908-DAG: v_accvgpr_read_b32 v{{[0-9]}}, a2 ; GFX908-DAG: v_accvgpr_read_b32 v{{[0-9]}}, a3 @@ -136,11 +125,10 @@ ; GFX908-DAG: v_accvgpr_read_b32 v{{[0-9]}}, a8 ; GFX908-DAG: v_accvgpr_read_b32 
v{{[0-9]}}, a9 -; GCN: NumVgprs: 10 -; GFX900: ScratchSize: 44 +; GFX908: NumVgprs: 10 ; GFX908: ScratchSize: 12 -; GCN: VGPRBlocks: 2 -; GCN: NumVGPRsForWavesPerEU: 10 +; GFX908: VGPRBlocks: 2 +; GFX908: NumVGPRsForWavesPerEU: 10 define amdgpu_kernel void @max_10_vgprs_used_1a_partial_spill(i64 addrspace(1)* %p) #0 { %tid = load volatile i32, i32 addrspace(1)* undef call void asm sideeffect "", "a"(i32 1) @@ -163,16 +151,15 @@ ret void } -; GCN-LABEL: {{^}}max_10_vgprs_spill_v32: -; GCN-DAG: s_mov_b32 s{{[0-9]+}}, SCRATCH_RSRC_DWORD0 -; GCN-DAG: s_mov_b32 s{{[0-9]+}}, SCRATCH_RSRC_DWORD1 -; GCN: buffer_store_dword v{{[0-9]}}, +; GFX908-LABEL: {{^}}max_10_vgprs_spill_v32: +; GFX908-DAG: s_mov_b32 s{{[0-9]+}}, SCRATCH_RSRC_DWORD0 +; GFX908-DAG: s_mov_b32 s{{[0-9]+}}, SCRATCH_RSRC_DWORD1 +; GFX908: buffer_store_dword v{{[0-9]}}, ; GFX908-DAG: v_accvgpr_write_b32 a0, v{{[0-9]}} ; GFX908-DAG: v_accvgpr_write_b32 a9, v{{[0-9]}} -; GCN-NOT: a10 +; GFX908-NOT: a10 ; GFX908: NumVgprs: 10 -; GFX900: ScratchSize: 100 ; GFX908: ScratchSize: 68 ; GFX908: VGPRBlocks: 2 ; GFX908: NumVGPRsForWavesPerEU: 10 @@ -184,23 +171,16 @@ ret void } -; GCN-LABEL: {{^}}max_256_vgprs_spill_9x32: -; GFX900-DAG: s_mov_b32 s{{[0-9]+}}, SCRATCH_RSRC_DWORD0 -; GFX900-DAG: s_mov_b32 s{{[0-9]+}}, SCRATCH_RSRC_DWORD1 +; GFX908-LABEL: {{^}}max_256_vgprs_spill_9x32: ; GFX908-NOT: SCRATCH_RSRC ; GFX908-DAG: v_accvgpr_write_b32 a0, v -; GFX900: buffer_store_dword v -; GFX900: buffer_load_dword v ; GFX908-NOT: buffer_ ; GFX908-DAG: v_accvgpr_read_b32 -; GFX900: NumVgprs: 256 -; GFX900: ScratchSize: 148 ; GFX908: NumVgprs: 255 ; GFX908: ScratchSize: 0 -; GCN: VGPRBlocks: 63 -; GFX900: NumVGPRsForWavesPerEU: 256 -; GFX908: NumVGPRsForWavesPerEU: 255 +; GFX908: VGPRBlocks: 63 +; GFX908: NumVGPRsForWavesPerEU: 255 define amdgpu_kernel void @max_256_vgprs_spill_9x32(<32 x float> addrspace(1)* %p) #1 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %p1 = getelementptr inbounds <32 x float>, <32 x float> 
addrspace(1)* %p, i32 %tid @@ -233,24 +213,17 @@ ret void } -; GCN-LABEL: {{^}}max_256_vgprs_spill_9x32_2bb: -; GFX900-DAG: s_mov_b32 s{{[0-9]+}}, SCRATCH_RSRC_DWORD0 -; GFX900-DAG: s_mov_b32 s{{[0-9]+}}, SCRATCH_RSRC_DWORD1 +; GFX908-LABEL: {{^}}max_256_vgprs_spill_9x32_2bb: ; GFX908-NOT: SCRATCH_RSRC ; GFX908: v_accvgpr_write_b32 ; GFX908: global_load_ -; GFX900: buffer_store_dword v -; GFX900: buffer_load_dword v ; GFX908-NOT: buffer_ ; GFX908-DAG: v_accvgpr_read_b32 -; GFX900: NumVgprs: 256 -; GFX908: NumVgprs: 253 -; GFX900: ScratchSize: 2052 +; GFX908: NumVgprs: 253 ; GFX908: ScratchSize: 0 -; GCN: VGPRBlocks: 63 -; GFX900: NumVGPRsForWavesPerEU: 256 -; GFX908: NumVGPRsForWavesPerEU: 253 +; GFX908: VGPRBlocks: 63 +; GFX908: NumVGPRsForWavesPerEU: 253 define amdgpu_kernel void @max_256_vgprs_spill_9x32_2bb(<32 x float> addrspace(1)* %p) #1 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %p1 = getelementptr inbounds <32 x float>, <32 x float> addrspace(1)* %p, i32 %tid @@ -289,7 +262,7 @@ ; Make sure there's no crash when we have loads from fixed stack ; objects and are processing VGPR spills -; GCN-LABEL: {{^}}stack_args_vgpr_spill: +; GFX908-LABEL: {{^}}stack_args_vgpr_spill: ; GFX908: v_accvgpr_write_b32 ; GFX908: buffer_load_dword v{{[0-9]+}}, off, s[0:3], s32 ; GFX908: buffer_load_dword v{{[0-9]+}}, off, s[0:3], s32 offset:4 @@ -324,7 +297,6 @@ ret void } - declare i32 @llvm.amdgcn.workitem.id.x() attributes #0 = { nounwind "amdgpu-num-vgpr"="10" }