diff --git a/llvm/lib/Target/AMDGPU/GCNPreRAOptimizations.cpp b/llvm/lib/Target/AMDGPU/GCNPreRAOptimizations.cpp --- a/llvm/lib/Target/AMDGPU/GCNPreRAOptimizations.cpp +++ b/llvm/lib/Target/AMDGPU/GCNPreRAOptimizations.cpp @@ -158,5 +158,72 @@ Changed |= processReg(Reg); } + for (MachineFunction::iterator BI = MF.begin(), BE = MF.end(); BI != BE; + ++BI) { + MachineBasicBlock *MBB = &*BI; + for (MachineBasicBlock::iterator I = MBB->begin(), E = MBB->end(); I != E; + ++I) { + MachineInstr &MI = *I; + + switch (MI.getOpcode()) { + default: + continue; + case AMDGPU::COPY: { + // Some subtargets cannot do an AGPR to AGPR copy directly, and need an + // intermediate temporary VGPR register. Try to find the defining + // accvgpr_write to avoid temporary registers. + + // GFX90A can directly copy AGPR to AGPR so nothing to do. + if (ST.hasGFX90AInsts()) + continue; + + Register DstReg = MI.getOperand(0).getReg(); + Register SrcReg = MI.getOperand(1).getReg(); + if (!DstReg.isVirtual() || !SrcReg.isVirtual()) + continue; + + // Check if copy is AGPR to AGPR + bool IsAGPRDst = TRI->hasAGPRs(MRI->getRegClass(DstReg)); + bool IsAGPRSrc = TRI->hasAGPRs(MRI->getRegClass(SrcReg)); + if (!IsAGPRDst || !IsAGPRSrc) + continue; + + // def_instructions() does not look at subregs so it may give us a + // different instruction that defines the same vreg but different subreg + // so we have to manually check subreg. + Register SrcSubReg = MI.getOperand(1).getSubReg(); + + for (auto &Def : MRI->def_instructions(SrcReg)) { + if (SrcSubReg != Def.getOperand(0).getSubReg()) + continue; + + if (Def.getOpcode() == AMDGPU::V_ACCVGPR_WRITE_B32_e64) { + MachineOperand DefSrcMO = Def.getOperand(1); + + // Immediates are not an issue and can be propagated in + // postrapseudos pass. Only handle cases where defining + // accvgpr_write source is a vreg. 
+ if (DefSrcMO.isReg() && DefSrcMO.getReg().isVirtual()) { + // Propagate source reg of accvgpr write to this copy instruction + MI.getOperand(1).setReg(DefSrcMO.getReg()); + MI.getOperand(1).setSubReg(DefSrcMO.getSubReg()); + + LIS->removeInterval(DefSrcMO.getReg()); + LIS->removeInterval(SrcReg); + LIS->createAndComputeVirtRegInterval(DefSrcMO.getReg()); + LIS->createAndComputeVirtRegInterval(SrcReg); + Changed = true; + } + + break; + } + } + + break; + } + } + } + } + return Changed; } diff --git a/llvm/test/CodeGen/AMDGPU/agpr-to-agpr-copy.mir b/llvm/test/CodeGen/AMDGPU/agpr-to-agpr-copy.mir new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/agpr-to-agpr-copy.mir @@ -0,0 +1,77 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py +# RUN: llc -march=amdgcn -mcpu=gfx908 -run-pass=liveintervals,amdgpu-pre-ra-optimizations -verify-machineinstrs %s -o - | FileCheck -check-prefix=GFX908 %s + +--- +name: test_mfma_f32_4x4x1f32_propagate_vgpr +tracksRegLiveness: true + +body: | + bb.0: + liveins: $sgpr0_sgpr1 + ; GFX908-LABEL: name: test_mfma_f32_4x4x1f32_propagate_vgpr + ; GFX908: liveins: $sgpr0_sgpr1 + ; GFX908: [[COPY:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr0_sgpr1 + ; GFX908: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[COPY]](p4), 36, 0 :: (dereferenceable invariant load (s64), align 4, addrspace 4) + ; GFX908: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + ; GFX908: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 1123418112, implicit $exec + ; GFX908: undef %4.sub0:areg_128 = V_ACCVGPR_WRITE_B32_e64 [[V_MOV_B32_e32_1]], implicit $exec + ; GFX908: %4.sub1:areg_128 = COPY [[V_MOV_B32_e32_1]] + ; GFX908: %4.sub2:areg_128 = COPY [[V_MOV_B32_e32_1]] + ; GFX908: %4.sub3:areg_128 = COPY [[V_MOV_B32_e32_1]] + ; GFX908: [[V_MOV_B32_e32_2:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 1073741824, implicit $exec + ; GFX908: [[V_MOV_B32_e32_3:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 1065353216, implicit 
$exec + ; GFX908: [[V_MFMA_F32_4X4X1F32_e64_:%[0-9]+]]:areg_128 = V_MFMA_F32_4X4X1F32_e64 [[V_MOV_B32_e32_3]], [[V_MOV_B32_e32_2]], %4, 0, 0, 0, implicit $mode, implicit $exec + ; GFX908: [[COPY1:%[0-9]+]]:vreg_128 = COPY [[V_MFMA_F32_4X4X1F32_e64_]] + ; GFX908: GLOBAL_STORE_DWORDX4_SADDR [[V_MOV_B32_e32_]], [[COPY1]], [[S_LOAD_DWORDX2_IMM]], 0, 0, implicit $exec :: (store (s128), addrspace 1) + ; GFX908: S_ENDPGM 0 + %1:sgpr_64(p4) = COPY $sgpr0_sgpr1 + %4:sreg_64_xexec = S_LOAD_DWORDX2_IMM %1:sgpr_64(p4), 36, 0 :: (dereferenceable invariant load (s64), align 4, addrspace 4) + %5:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %13:vgpr_32 = V_MOV_B32_e32 1123418112, implicit $exec + undef %11.sub0:areg_128 = V_ACCVGPR_WRITE_B32_e64 %13:vgpr_32, implicit $exec + %11.sub1:areg_128 = COPY %11.sub0:areg_128 + %11.sub2:areg_128 = COPY %11.sub0:areg_128 + %11.sub3:areg_128 = COPY %11.sub0:areg_128 + %8:vgpr_32 = V_MOV_B32_e32 1073741824, implicit $exec + %9:vgpr_32 = V_MOV_B32_e32 1065353216, implicit $exec + %10:areg_128 = V_MFMA_F32_4X4X1F32_e64 %9:vgpr_32, %8:vgpr_32, %11:areg_128, 0, 0, 0, implicit $mode, implicit $exec + %12:vreg_128 = COPY %10:areg_128 + GLOBAL_STORE_DWORDX4_SADDR %5:vgpr_32, %12:vreg_128, %4:sreg_64_xexec, 0, 0, implicit $exec :: (store (s128), addrspace 1) + S_ENDPGM 0 +... 
+--- +name: test_mfma_f32_4x4x1f32_no_propagate_imm +tracksRegLiveness: true + +body: | + bb.0: + liveins: $sgpr0_sgpr1 + ; GFX908-LABEL: name: test_mfma_f32_4x4x1f32_no_propagate_imm + ; GFX908: liveins: $sgpr0_sgpr1 + ; GFX908: [[COPY:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr0_sgpr1 + ; GFX908: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[COPY]](p4), 36, 0 :: (dereferenceable invariant load (s64), align 4, addrspace 4) + ; GFX908: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + ; GFX908: undef %3.sub0:areg_128 = V_ACCVGPR_WRITE_B32_e64 1073741824, implicit $exec + ; GFX908: %3.sub1:areg_128 = COPY %3.sub0 + ; GFX908: %3.sub2:areg_128 = COPY %3.sub0 + ; GFX908: %3.sub3:areg_128 = COPY %3.sub0 + ; GFX908: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 1073741824, implicit $exec + ; GFX908: [[V_MOV_B32_e32_2:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 1065353216, implicit $exec + ; GFX908: [[V_MFMA_F32_4X4X1F32_e64_:%[0-9]+]]:areg_128 = V_MFMA_F32_4X4X1F32_e64 [[V_MOV_B32_e32_2]], [[V_MOV_B32_e32_1]], %3, 0, 0, 0, implicit $mode, implicit $exec + ; GFX908: [[COPY1:%[0-9]+]]:vreg_128 = COPY [[V_MFMA_F32_4X4X1F32_e64_]] + ; GFX908: GLOBAL_STORE_DWORDX4_SADDR [[V_MOV_B32_e32_]], [[COPY1]], [[S_LOAD_DWORDX2_IMM]], 0, 0, implicit $exec :: (store (s128), addrspace 1) + ; GFX908: S_ENDPGM 0 + %1:sgpr_64(p4) = COPY $sgpr0_sgpr1 + %4:sreg_64_xexec = S_LOAD_DWORDX2_IMM %1:sgpr_64(p4), 36, 0 :: (dereferenceable invariant load (s64), align 4, addrspace 4) + %5:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + undef %11.sub0:areg_128 = V_ACCVGPR_WRITE_B32_e64 1073741824, implicit $exec + %11.sub1:areg_128 = COPY %11.sub0:areg_128 + %11.sub2:areg_128 = COPY %11.sub0:areg_128 + %11.sub3:areg_128 = COPY %11.sub0:areg_128 + %8:vgpr_32 = V_MOV_B32_e32 1073741824, implicit $exec + %9:vgpr_32 = V_MOV_B32_e32 1065353216, implicit $exec + %10:areg_128 = V_MFMA_F32_4X4X1F32_e64 %9:vgpr_32, %8:vgpr_32, %11:areg_128, 0, 0, 0, implicit $mode, implicit $exec + 
%12:vreg_128 = COPY %10:areg_128 + GLOBAL_STORE_DWORDX4_SADDR %5:vgpr_32, %12:vreg_128, %4:sreg_64_xexec, 0, 0, implicit $exec :: (store (s128), addrspace 1) + S_ENDPGM 0 +... diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.ll --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.ll @@ -578,19 +578,12 @@ ret void } -; FIXME: Resulting code for splat is pretty bad. A v_mov_b32 is moved -; in the middle of the expanded agpr reg_sequence. The broadcast of -; the individual AGPR->AGPR components should avoid the intermediate AGPR case. ; GCN-LABEL: {{^}}test_mfma_f32_4x4x1f32_lit_splat_bad_code: ; GFX908_A: v_mov_b32_e32 [[TMP0:v[0-9]+]], 0x42f60000 ; GCN: v_accvgpr_write_b32 [[AGPR:a[0-9]+]], [[TMP0]] -; GFX908: s_nop 0 -; GFX908: v_accvgpr_read_b32 [[TMP1:v[0-9]+]], [[AGPR]] -; GFX908: v_accvgpr_read_b32 [[TMP2:v[0-9]+]], [[AGPR]] -; GFX908: v_accvgpr_read_b32 [[TMP3:v[0-9]+]], [[AGPR]] -; GFX908: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP1]] -; GFX908: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP2]] -; GFX908: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP3]] +; GFX908-NEXT: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP0]] +; GFX908-NEXT: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP0]] +; GFX908-NEXT: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP0]] ; GFX90A-COUNT-3: v_accvgpr_mov_b32 a{{[0-9]+}}, [[AGPR]] ; GCN: s_nop 0 ; GFX908_A: v_mfma_f32_4x4x1f32 a[{{[0-9]+:[0-9]+}}], {{v[0-9]+}}, {{v[0-9]+}}, a[{{[0-9]+:[0-9]+}}]