diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp @@ -540,8 +540,9 @@ .addReg(SrcReg, getKillRegState(KillSrc)); } -/// Handle copying from SGPR to AGPR, or from AGPR to AGPR. It is not possible -/// to directly copy, so an intermediate VGPR needs to be used. +/// Handle copying from SGPR to AGPR, or from AGPR to AGPR on GFX908. It is not +/// possible to have a direct copy in these cases on GFX908, so an intermediate +/// VGPR copy is required. static void indirectCopyToAGPR(const SIInstrInfo &TII, MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, @@ -550,10 +551,18 @@ RegScavenger &RS, Register ImpDefSuperReg = Register(), Register ImpUseSuperReg = Register()) { - const SIRegisterInfo &RI = TII.getRegisterInfo(); + assert((TII.getSubtarget().hasMAIInsts() && + !TII.getSubtarget().hasGFX90AInsts()) && + "Expected GFX908 subtarget."); + + assert((AMDGPU::SReg_32RegClass.contains(SrcReg) || + AMDGPU::AGPR_32RegClass.contains(SrcReg)) && + "Source register of the copy should be either an SGPR or an AGPR."); - assert(AMDGPU::SReg_32RegClass.contains(SrcReg) || - AMDGPU::AGPR_32RegClass.contains(SrcReg)); + assert(AMDGPU::AGPR_32RegClass.contains(DestReg) && + "Destination register of the copy should be an AGPR."); + + const SIRegisterInfo &RI = TII.getRegisterInfo(); // First try to find defining accvgpr_write to avoid temporary registers. for (auto Def = MI, E = MBB.begin(); Def != E; ) { @@ -605,23 +614,18 @@ // Registers in the sequence are allocated contiguously so we can just // use register number to pick one of three round-robin temps. 
unsigned RegNo = DestReg % 3; - Register Tmp; - if (!TII.getSubtarget().hasGFX90AInsts()) { - Tmp = AMDGPU::VGPR32; - assert(MBB.getParent()->getRegInfo().isReserved(AMDGPU::VGPR32)); - - // Only loop through if there are any free registers left, otherwise - // scavenger may report a fatal error without emergency spill slot - // or spill with the slot. - while (RegNo-- && RS.FindUnusedReg(&AMDGPU::VGPR_32RegClass)) { - Register Tmp2 = RS.scavengeRegister(&AMDGPU::VGPR_32RegClass, 0); - if (!Tmp2 || RI.getHWRegIndex(Tmp2) >= MaxVGPRs) - break; - Tmp = Tmp2; - RS.setRegUsed(Tmp); - } - } else { - Tmp = RS.scavengeRegister(&AMDGPU::VGPR_32RegClass, 0); + Register Tmp = AMDGPU::VGPR32; + assert(MBB.getParent()->getRegInfo().isReserved(Tmp) && + "VGPR used for an intermediate copy should have been reserved."); + + // Only loop through if there are any free registers left, otherwise + // scavenger may report a fatal error without emergency spill slot + // or spill with the slot. + while (RegNo-- && RS.FindUnusedReg(&AMDGPU::VGPR_32RegClass)) { + Register Tmp2 = RS.scavengeRegister(&AMDGPU::VGPR_32RegClass, 0); + if (!Tmp2 || RI.getHWRegIndex(Tmp2) >= MaxVGPRs) + break; + Tmp = Tmp2; RS.setRegUsed(Tmp); } diff --git a/llvm/test/CodeGen/AMDGPU/agpr-copy-no-free-registers.ll b/llvm/test/CodeGen/AMDGPU/agpr-copy-no-free-registers.ll --- a/llvm/test/CodeGen/AMDGPU/agpr-copy-no-free-registers.ll +++ b/llvm/test/CodeGen/AMDGPU/agpr-copy-no-free-registers.ll @@ -2,10 +2,10 @@ ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx908 < %s | FileCheck -check-prefix=GFX908 %s ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a < %s | FileCheck -check-prefix=GFX90A %s -; This testcase would fail due to not having a free VGPR available to +; This testcase would fail on GFX908 due to not having a free VGPR available to ; copy between AGPRs. 
-define void @no_free_vgprs_at_agpr_copy(float %v0, float %v1) #0 { -; GFX908-LABEL: no_free_vgprs_at_agpr_copy: +define void @no_free_vgprs_at_agpr_to_agpr_copy(float %v0, float %v1) #0 { +; GFX908-LABEL: no_free_vgprs_at_agpr_to_agpr_copy: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v33, v1 @@ -156,7 +156,7 @@ ; GFX908-NEXT: ;;#ASMEND ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: no_free_vgprs_at_agpr_copy: +; GFX90A-LABEL: no_free_vgprs_at_agpr_to_agpr_copy: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v33, v0 @@ -864,6 +864,256 @@ br i1 %i66, label %bb16, label %bb12 } +; This testcase would fail on GFX908 due to not having a free VGPR available to +; copy from SGPR to AGPR. +define void @no_free_vgprs_at_sgpr_to_agpr_copy(float %v0, float %v1) #0 { +; GFX908-LABEL: no_free_vgprs_at_sgpr_to_agpr_copy: +; GFX908: ; %bb.0: +; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX908-NEXT: v_mov_b32_e32 v33, v1 +; GFX908-NEXT: v_mov_b32_e32 v34, v0 +; GFX908-NEXT: ;;#ASMSTART +; GFX908-NEXT: ; def v[0:31] s[0:15] +; GFX908-NEXT: ;;#ASMEND +; GFX908-NEXT: v_mov_b32_e32 v32, s15 +; GFX908-NEXT: s_nop 1 +; GFX908-NEXT: v_accvgpr_write_b32 a31, v32 +; GFX908-NEXT: v_mov_b32_e32 v32, s14 +; GFX908-NEXT: s_nop 1 +; GFX908-NEXT: v_accvgpr_write_b32 a30, v32 +; GFX908-NEXT: v_mov_b32_e32 v32, s13 +; GFX908-NEXT: s_nop 1 +; GFX908-NEXT: v_accvgpr_write_b32 a29, v32 +; GFX908-NEXT: v_mov_b32_e32 v32, s12 +; GFX908-NEXT: s_nop 1 +; GFX908-NEXT: v_accvgpr_write_b32 a28, v32 +; GFX908-NEXT: v_mov_b32_e32 v32, s11 +; GFX908-NEXT: s_nop 1 +; GFX908-NEXT: v_accvgpr_write_b32 a27, v32 +; GFX908-NEXT: v_mov_b32_e32 v32, s10 +; GFX908-NEXT: s_nop 1 +; GFX908-NEXT: v_accvgpr_write_b32 a26, v32 +; GFX908-NEXT: v_mov_b32_e32 v32, s9 +; GFX908-NEXT: s_nop 1 +; GFX908-NEXT: v_accvgpr_write_b32 a25, v32 +; GFX908-NEXT: v_mov_b32_e32 v32, s8 +; GFX908-NEXT: s_nop 1 
+; GFX908-NEXT: v_accvgpr_write_b32 a24, v32 +; GFX908-NEXT: v_mov_b32_e32 v32, s7 +; GFX908-NEXT: s_nop 1 +; GFX908-NEXT: v_accvgpr_write_b32 a23, v32 +; GFX908-NEXT: v_mov_b32_e32 v32, s6 +; GFX908-NEXT: s_nop 1 +; GFX908-NEXT: v_accvgpr_write_b32 a22, v32 +; GFX908-NEXT: v_mov_b32_e32 v32, s5 +; GFX908-NEXT: s_nop 1 +; GFX908-NEXT: v_accvgpr_write_b32 a21, v32 +; GFX908-NEXT: v_mov_b32_e32 v32, s4 +; GFX908-NEXT: s_nop 1 +; GFX908-NEXT: v_accvgpr_write_b32 a20, v32 +; GFX908-NEXT: v_mov_b32_e32 v32, s3 +; GFX908-NEXT: s_nop 1 +; GFX908-NEXT: v_accvgpr_write_b32 a19, v32 +; GFX908-NEXT: v_mov_b32_e32 v32, s2 +; GFX908-NEXT: s_nop 1 +; GFX908-NEXT: v_accvgpr_write_b32 a18, v32 +; GFX908-NEXT: v_mov_b32_e32 v32, s1 +; GFX908-NEXT: s_nop 1 +; GFX908-NEXT: v_accvgpr_write_b32 a17, v32 +; GFX908-NEXT: v_mov_b32_e32 v32, s0 +; GFX908-NEXT: s_nop 1 +; GFX908-NEXT: v_accvgpr_write_b32 a16, v32 +; GFX908-NEXT: s_nop 0 +; GFX908-NEXT: v_mfma_f32_16x16x1f32 a[0:15], v34, v33, a[16:31] +; GFX908-NEXT: s_nop 7 +; GFX908-NEXT: s_nop 1 +; GFX908-NEXT: v_accvgpr_read_b32 v32, a0 ; Reload Reuse +; GFX908-NEXT: v_accvgpr_read_b32 v39, a11 ; Reload Reuse +; GFX908-NEXT: v_accvgpr_read_b32 v38, a12 ; Reload Reuse +; GFX908-NEXT: buffer_store_dword v32, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX908-NEXT: v_accvgpr_read_b32 v32, a1 ; Reload Reuse +; GFX908-NEXT: v_accvgpr_read_b32 v37, a13 ; Reload Reuse +; GFX908-NEXT: v_accvgpr_read_b32 v36, a14 ; Reload Reuse +; GFX908-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; GFX908-NEXT: v_accvgpr_read_b32 v32, a2 ; Reload Reuse +; GFX908-NEXT: v_accvgpr_read_b32 v35, a15 ; Reload Reuse +; GFX908-NEXT: s_nop 0 +; GFX908-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; GFX908-NEXT: v_accvgpr_read_b32 v32, a3 ; Reload Reuse +; GFX908-NEXT: s_nop 1 +; GFX908-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; GFX908-NEXT: v_accvgpr_read_b32 v32, 
a4 ; Reload Reuse +; GFX908-NEXT: s_nop 1 +; GFX908-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; GFX908-NEXT: v_accvgpr_read_b32 v32, a5 ; Reload Reuse +; GFX908-NEXT: s_nop 1 +; GFX908-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; GFX908-NEXT: v_accvgpr_read_b32 v32, a6 ; Reload Reuse +; GFX908-NEXT: s_nop 1 +; GFX908-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; GFX908-NEXT: v_accvgpr_read_b32 v32, a7 ; Reload Reuse +; GFX908-NEXT: s_nop 1 +; GFX908-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; GFX908-NEXT: v_accvgpr_read_b32 v32, a8 ; Reload Reuse +; GFX908-NEXT: s_nop 1 +; GFX908-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; GFX908-NEXT: v_accvgpr_read_b32 v32, a9 ; Reload Reuse +; GFX908-NEXT: s_nop 1 +; GFX908-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; GFX908-NEXT: v_accvgpr_read_b32 v32, a10 ; Reload Reuse +; GFX908-NEXT: s_nop 1 +; GFX908-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; GFX908-NEXT: ;;#ASMSTART +; GFX908-NEXT: ; copy +; GFX908-NEXT: ;;#ASMEND +; GFX908-NEXT: v_accvgpr_read_b32 v32, a1 +; GFX908-NEXT: s_nop 1 +; GFX908-NEXT: v_accvgpr_write_b32 a32, v32 +; GFX908-NEXT: buffer_load_dword v32, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX908-NEXT: s_waitcnt vmcnt(0) +; GFX908-NEXT: v_accvgpr_write_b32 a0, v32 ; Reload Reuse +; GFX908-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; GFX908-NEXT: s_waitcnt vmcnt(0) +; GFX908-NEXT: v_accvgpr_write_b32 a1, v32 ; Reload Reuse +; GFX908-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; GFX908-NEXT: s_waitcnt vmcnt(0) +; GFX908-NEXT: v_accvgpr_write_b32 a2, v32 ; Reload Reuse +; GFX908-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; GFX908-NEXT: s_waitcnt 
vmcnt(0) +; GFX908-NEXT: v_accvgpr_write_b32 a3, v32 ; Reload Reuse +; GFX908-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; GFX908-NEXT: s_waitcnt vmcnt(0) +; GFX908-NEXT: v_accvgpr_write_b32 a4, v32 ; Reload Reuse +; GFX908-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; GFX908-NEXT: s_waitcnt vmcnt(0) +; GFX908-NEXT: v_accvgpr_write_b32 a5, v32 ; Reload Reuse +; GFX908-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; GFX908-NEXT: s_waitcnt vmcnt(0) +; GFX908-NEXT: v_accvgpr_write_b32 a6, v32 ; Reload Reuse +; GFX908-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; GFX908-NEXT: s_waitcnt vmcnt(0) +; GFX908-NEXT: v_accvgpr_write_b32 a7, v32 ; Reload Reuse +; GFX908-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; GFX908-NEXT: s_waitcnt vmcnt(0) +; GFX908-NEXT: v_accvgpr_write_b32 a8, v32 ; Reload Reuse +; GFX908-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; GFX908-NEXT: s_waitcnt vmcnt(0) +; GFX908-NEXT: v_accvgpr_write_b32 a9, v32 ; Reload Reuse +; GFX908-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; GFX908-NEXT: s_waitcnt vmcnt(0) +; GFX908-NEXT: v_accvgpr_write_b32 a10, v32 ; Reload Reuse +; GFX908-NEXT: v_accvgpr_write_b32 a11, v39 ; Reload Reuse +; GFX908-NEXT: v_accvgpr_write_b32 a12, v38 ; Reload Reuse +; GFX908-NEXT: v_accvgpr_write_b32 a13, v37 ; Reload Reuse +; GFX908-NEXT: v_accvgpr_write_b32 a14, v36 ; Reload Reuse +; GFX908-NEXT: v_accvgpr_write_b32 a15, v35 ; Reload Reuse +; GFX908-NEXT: ;;#ASMSTART +; GFX908-NEXT: ; copy +; GFX908-NEXT: ;;#ASMEND +; GFX908-NEXT: v_accvgpr_read_b32 v33, a2 +; GFX908-NEXT: s_nop 1 +; GFX908-NEXT: v_accvgpr_write_b32 a3, v33 +; GFX908-NEXT: ;;#ASMSTART +; GFX908-NEXT: ; use a3 v[0:31] +; GFX908-NEXT: ;;#ASMEND +; GFX908-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: 
no_free_vgprs_at_sgpr_to_agpr_copy: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v33, v0 +; GFX90A-NEXT: v_mov_b32_e32 v32, v1 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:31] s[0:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_accvgpr_write_b32 a31, s15 +; GFX90A-NEXT: v_accvgpr_write_b32 a30, s14 +; GFX90A-NEXT: v_accvgpr_write_b32 a29, s13 +; GFX90A-NEXT: v_accvgpr_write_b32 a28, s12 +; GFX90A-NEXT: v_accvgpr_write_b32 a27, s11 +; GFX90A-NEXT: v_accvgpr_write_b32 a26, s10 +; GFX90A-NEXT: v_accvgpr_write_b32 a25, s9 +; GFX90A-NEXT: v_accvgpr_write_b32 a24, s8 +; GFX90A-NEXT: v_accvgpr_write_b32 a23, s7 +; GFX90A-NEXT: v_accvgpr_write_b32 a22, s6 +; GFX90A-NEXT: v_accvgpr_write_b32 a21, s5 +; GFX90A-NEXT: v_accvgpr_write_b32 a20, s4 +; GFX90A-NEXT: v_accvgpr_write_b32 a19, s3 +; GFX90A-NEXT: v_accvgpr_write_b32 a18, s2 +; GFX90A-NEXT: v_accvgpr_write_b32 a17, s1 +; GFX90A-NEXT: v_accvgpr_write_b32 a16, s0 +; GFX90A-NEXT: v_accvgpr_read_b32 v34, a32 ; Reload Reuse +; GFX90A-NEXT: s_nop 0 +; GFX90A-NEXT: v_mfma_f32_16x16x1f32 a[0:15], v33, v32, a[16:31] +; GFX90A-NEXT: s_nop 7 +; GFX90A-NEXT: s_nop 2 +; GFX90A-NEXT: buffer_store_dword a0, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: buffer_store_dword a1, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; GFX90A-NEXT: buffer_store_dword a2, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; GFX90A-NEXT: buffer_store_dword a3, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; GFX90A-NEXT: buffer_store_dword a4, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; GFX90A-NEXT: buffer_store_dword a5, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; GFX90A-NEXT: buffer_store_dword a6, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; GFX90A-NEXT: buffer_store_dword a7, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; GFX90A-NEXT: buffer_store_dword a8, off, s[0:3], s32 offset:32 ; 4-byte Folded 
Spill +; GFX90A-NEXT: buffer_store_dword a9, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; GFX90A-NEXT: buffer_store_dword a10, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; GFX90A-NEXT: v_accvgpr_read_b32 v39, a11 ; Reload Reuse +; GFX90A-NEXT: v_accvgpr_read_b32 v38, a12 ; Reload Reuse +; GFX90A-NEXT: v_accvgpr_read_b32 v37, a13 ; Reload Reuse +; GFX90A-NEXT: v_accvgpr_read_b32 v36, a14 ; Reload Reuse +; GFX90A-NEXT: v_accvgpr_read_b32 v35, a15 ; Reload Reuse +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; copy +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_accvgpr_mov_b32 a32, a1 +; GFX90A-NEXT: buffer_load_dword a0, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX90A-NEXT: s_nop 0 +; GFX90A-NEXT: buffer_load_dword a1, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; GFX90A-NEXT: s_nop 0 +; GFX90A-NEXT: buffer_load_dword a2, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; GFX90A-NEXT: s_nop 0 +; GFX90A-NEXT: buffer_load_dword a3, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; GFX90A-NEXT: s_nop 0 +; GFX90A-NEXT: buffer_load_dword a4, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; GFX90A-NEXT: s_nop 0 +; GFX90A-NEXT: buffer_load_dword a5, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; GFX90A-NEXT: s_nop 0 +; GFX90A-NEXT: buffer_load_dword a6, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; GFX90A-NEXT: s_nop 0 +; GFX90A-NEXT: buffer_load_dword a7, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; GFX90A-NEXT: s_nop 0 +; GFX90A-NEXT: buffer_load_dword a8, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; GFX90A-NEXT: s_nop 0 +; GFX90A-NEXT: buffer_load_dword a9, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; GFX90A-NEXT: s_nop 0 +; GFX90A-NEXT: buffer_load_dword a10, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_accvgpr_write_b32 a11, v39 ; Reload Reuse +; GFX90A-NEXT: v_accvgpr_write_b32 a12, v38 ; Reload Reuse +; GFX90A-NEXT: v_accvgpr_write_b32 a13, v37 ; 
Reload Reuse +; GFX90A-NEXT: v_accvgpr_write_b32 a14, v36 ; Reload Reuse +; GFX90A-NEXT: v_accvgpr_write_b32 a15, v35 ; Reload Reuse +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; copy +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_accvgpr_mov_b32 a3, a2 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use a3 v[0:31] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_accvgpr_write_b32 a32, v34 ; Reload Reuse +; GFX90A-NEXT: s_setpc_b64 s[30:31] + %asm = call { <32 x i32>, <16 x float> } asm sideeffect "; def $0 $1","=${v[0:31]},=${s[0:15]}"() + %vgpr0 = extractvalue { <32 x i32>, <16 x float> } %asm, 0 + %agpr0 = extractvalue { <32 x i32>, <16 x float> } %asm, 1 + %mfma = call <16 x float> @llvm.amdgcn.mfma.f32.16x16x1f32(float %v0, float %v1, <16 x float> %agpr0, i32 0, i32 0, i32 0) + %agpr1 = call i32 asm sideeffect "; copy ", "={a1},a,~{a[0:15]}"(<16 x float> %agpr0) + %agpr2 = call i32 asm sideeffect "; copy ", "={a2},a,{a[0:15]}"(i32 %agpr1, <16 x float> %mfma) + call void asm sideeffect "; use $0 $1","{a3},{v[0:31]}"(i32 %agpr2, <32 x i32> %vgpr0) + ret void +} + declare <16 x float> @llvm.amdgcn.mfma.f32.16x16x1f32(float, float, <16 x float>, i32 immarg, i32 immarg, i32 immarg) #1 declare i32 @llvm.amdgcn.workitem.id.x() #2