Index: llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
===================================================================
--- llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -144,6 +144,8 @@
   case AMDGPU::V_MOV_B32_e32:
   case AMDGPU::V_MOV_B32_e64:
   case AMDGPU::V_MOV_B64_PSEUDO:
+  case AMDGPU::V_ACCVGPR_READ_B32:
+  case AMDGPU::V_ACCVGPR_WRITE_B32:
     // No implicit operands.
     return MI.getNumOperands() == MI.getDesc().getNumOperands();
   default:
Index: llvm/lib/Target/AMDGPU/VOP3PInstructions.td
===================================================================
--- llvm/lib/Target/AMDGPU/VOP3PInstructions.td
+++ llvm/lib/Target/AMDGPU/VOP3PInstructions.td
@@ -366,10 +366,13 @@
 def VOPProfileMAI_F32_V4F16_X32 : VOPProfileMAI;
 
 let Predicates = [HasMAIInsts] in {
+
+let isAsCheapAsAMove = 1, isReMaterializable = 1 in {
 def V_ACCVGPR_READ_B32 : VOP3Inst<"v_accvgpr_read_b32", VOPProfileAccRead>;
 def V_ACCVGPR_WRITE_B32 : VOP3Inst<"v_accvgpr_write_b32", VOPProfileAccWrite> {
   let isMoveImm = 1;
 }
+} // End isAsCheapAsAMove = 1, isReMaterializable = 1
 
 // FP32 denorm mode is respected, rounding mode is not. Exceptions are not supported.
 let isConvergent = 1, mayRaiseFPException = 0, ReadsModeReg = 1 in {
Index: llvm/test/CodeGen/AMDGPU/agpr-remat.ll
===================================================================
--- /dev/null
+++ llvm/test/CodeGen/AMDGPU/agpr-remat.ll
@@ -0,0 +1,51 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -march=amdgcn -mcpu=gfx908 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX908 %s
+
+; Make sure no v_accvgpr_read_b32 instructions are emitted to copy values
+; back and forth between AGPRs and VGPRs.
+define amdgpu_kernel void @remat_constant_voids_spill(i32 addrspace(1)* %p) #1 { ; NOTE(review): "voids" is presumably a typo for "avoids" (cf. @remat_regcopy_avoids_spill) - confirm before renaming
+; GFX908-LABEL: remat_constant_voids_spill:
+; GFX908: ; %bb.0:
+; GFX908-NEXT: v_accvgpr_write_b32 a1, 1
+; GFX908-NEXT: v_accvgpr_write_b32 a5, 6
+; GFX908-NEXT: v_accvgpr_write_b32 a6, 7
+; GFX908-NEXT: v_accvgpr_write_b32 a7, 8
+; GFX908-NEXT: v_accvgpr_write_b32 a0, 9
+; GFX908-NEXT: v_accvgpr_write_b32 a2, 2
+; GFX908-NEXT: v_accvgpr_write_b32 a3, 3
+; GFX908-NEXT: v_accvgpr_write_b32 a4, 4
+; GFX908-NEXT: ;;#ASMSTART
+; GFX908-NEXT: ;;#ASMEND
+; GFX908-NEXT: v_accvgpr_write_b32 a1, 5
+; GFX908-NEXT: ;;#ASMSTART
+; GFX908-NEXT: ;;#ASMEND
+; GFX908-NEXT: s_endpgm
+  call void asm sideeffect "", "a,a,a,a"(i32 1, i32 2, i32 3, i32 4) ; four constants, each bound to an "a"-constrained operand
+  call void asm sideeffect "", "a,a,a,a,a"(i32 5, i32 6, i32 7, i32 8, i32 9) ; five more constants as "a"-constrained operands
+  ret void
+}
+
+define void @remat_regcopy_avoids_spill(i32 %v0, i32 %v1, i32 %v2, i32 %v3, i32 %v4, i32 %v5, i32 %v6, i32 %v7, i32 %v8, i32 %v9, i32 %v10) #1 {
+; GFX908-LABEL: remat_regcopy_avoids_spill:
+; GFX908: ; %bb.0:
+; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX908-NEXT: v_accvgpr_write_b32 a1, v0
+; GFX908-NEXT: v_accvgpr_write_b32 a2, v1
+; GFX908-NEXT: v_accvgpr_write_b32 a3, v2
+; GFX908-NEXT: v_accvgpr_write_b32 a4, v3
+; GFX908-NEXT: ;;#ASMSTART
+; GFX908-NEXT: ;;#ASMEND
+; GFX908-NEXT: v_accvgpr_write_b32 a0, v8
+; GFX908-NEXT: v_accvgpr_write_b32 a1, v4
+; GFX908-NEXT: v_accvgpr_write_b32 a2, v5
+; GFX908-NEXT: v_accvgpr_write_b32 a3, v6
+; GFX908-NEXT: v_accvgpr_write_b32 a4, v7
+; GFX908-NEXT: ;;#ASMSTART
+; GFX908-NEXT: ;;#ASMEND
+; GFX908-NEXT: s_setpc_b64 s[30:31]
+  call void asm sideeffect "", "a,a,a,a"(i32 %v0, i32 %v1, i32 %v2, i32 %v3) ; arguments %v0-%v3 as "a"-constrained operands
+  call void asm sideeffect "", "a,a,a,a,a"(i32 %v4, i32 %v5, i32 %v6, i32 %v7, i32 %v8) ; arguments %v4-%v8 as "a"-constrained operands
+  ret void
+}
+
+attributes #1 = { nounwind "amdgpu-num-vgpr"="8" } ; NOTE(review): presumably the 8-register cap is what creates the pressure these tests rely on - confirm
Index: llvm/test/CodeGen/AMDGPU/spill-agpr.ll
===================================================================
--- 
llvm/test/CodeGen/AMDGPU/spill-agpr.ll
+++ llvm/test/CodeGen/AMDGPU/spill-agpr.ll
@@ -64,15 +64,26 @@
 ; A2M-DAG: s_mov_b32 s{{[0-9]+}}, SCRATCH_RSRC_DWORD0
 ; A2M-DAG: s_mov_b32 s{{[0-9]+}}, SCRATCH_RSRC_DWORD1
 ; A2V-NOT: SCRATCH_RSRC
-; GFX908-DAG: v_accvgpr_read_b32 v[[VSPILL:[0-9]+]], a0
-; A2M: buffer_store_dword v[[VSPILL]], off, s[{{[0-9:]+}}], 0 offset:[[FI:[0-9]+]] ; 4-byte Folded Spill
-; A2M: buffer_load_dword v[[VSPILL:[0-9]+]], off, s[{{[0-9:]+}}], 0 offset:[[FI]] ; 4-byte Folded Reload
-; GFX908: v_accvgpr_write_b32 a{{[0-9]+}}, v[[VSPILL]]
-; A2V: ScratchSize: 0
-define amdgpu_kernel void @max_10_vgprs_used_9a(i32 addrspace(1)* %p) #1 {
-  %tid = call i32 @llvm.amdgcn.workitem.id.x()
-  call void asm sideeffect "", "a,a,a,a"(i32 1, i32 2, i32 3, i32 4)
-  call void asm sideeffect "", "a,a,a,a,a"(i32 5, i32 6, i32 7, i32 8, i32 9)
+
+; A2V: v_accvgpr_read_b32 v[[VSPILL:[0-9]+]], a{{[0-9]+}}
+; A2V: v_accvgpr_write_b32 a{{[0-9]+}}, v[[VSPILL]]
+; A2V: ScratchSize: 0
+
+; A2M: buffer_store_dword v[[VSPILLSTORE:[0-9]+]], off, s[{{[0-9:]+}}], 0 offset:[[FI:[0-9]+]] ; 4-byte Folded Spill
+; A2M: buffer_load_dword v[[VSPILL_RELOAD:[0-9]+]], off, s[{{[0-9:]+}}], 0 offset:[[FI]] ; 4-byte Folded Reload
+; A2M: v_accvgpr_write_b32 a{{[0-9]+}}, v[[VSPILL_RELOAD]]
+define amdgpu_kernel void @max_10_vgprs_used_9a() #1 {
+  %v0 = load volatile i32, i32 addrspace(3)* undef
+  %v1 = load volatile i32, i32 addrspace(3)* undef
+  %v2 = load volatile i32, i32 addrspace(3)* undef
+  %v3 = load volatile i32, i32 addrspace(3)* undef
+  %v4 = load volatile i32, i32 addrspace(3)* undef
+  %v5 = load volatile i32, i32 addrspace(3)* undef
+  %v6 = load volatile i32, i32 addrspace(3)* undef
+  %v7 = load volatile i32, i32 addrspace(3)* undef
+  call void asm sideeffect "", "a,a,a,a,~{v0},~{v1},~{v2},~{v3},~{v4},~{v5},~{v6}"(i32 %v0, i32 %v1, i32 %v2, i32 %v3) ; NOTE(review): also clobbers v0-v6 while %v4-%v7 are still live; presumably this is what forces the spill being checked - confirm
+  %v8 = load volatile i32, i32 addrspace(3)* undef
+  call void asm sideeffect "", "a,a,a,a,a"(i32 %v4, i32 %v5, i32 %v6, i32 %v7, i32 %v8)
   ret void
 }
 