diff --git a/llvm/lib/Target/AMDGPU/GCNPreRAOptimizations.cpp b/llvm/lib/Target/AMDGPU/GCNPreRAOptimizations.cpp --- a/llvm/lib/Target/AMDGPU/GCNPreRAOptimizations.cpp +++ b/llvm/lib/Target/AMDGPU/GCNPreRAOptimizations.cpp @@ -40,6 +40,7 @@ class GCNPreRAOptimizations : public MachineFunctionPass { private: const SIInstrInfo *TII; + const SIRegisterInfo *TRI; MachineRegisterInfo *MRI; LiveIntervals *LIS; @@ -85,32 +86,106 @@ MachineInstr *Def0 = nullptr; MachineInstr *Def1 = nullptr; uint64_t Init = 0; + bool Changed = false; + SmallSet<Register, 2> ModifiedRegs; + bool IsAGPRDst = TRI->isAGPRClass(MRI->getRegClass(Reg)); for (MachineInstr &I : MRI->def_instructions(Reg)) { - if (I.getOpcode() != AMDGPU::S_MOV_B32 || I.getOperand(0).getReg() != Reg || - !I.getOperand(1).isImm() || I.getNumOperands() != 2) - return false; - - switch (I.getOperand(0).getSubReg()) { + switch (I.getOpcode()) { default: return false; - case AMDGPU::sub0: - if (Def0) - return false; - Def0 = &I; - Init |= I.getOperand(1).getImm() & 0xffffffff; + case AMDGPU::V_ACCVGPR_WRITE_B32_e64: break; - case AMDGPU::sub1: - if (Def1) + case AMDGPU::COPY: { + // Some subtargets cannot do an AGPR to AGPR copy directly, and need an + // intermediate temporary VGPR register. Try to find the defining + // accvgpr_write to avoid temporary registers. + if (!IsAGPRDst) + break; + + Register SrcReg = I.getOperand(1).getReg(); + + if (!SrcReg.isVirtual()) + break; + + // Check if source of copy is from another AGPR. + bool IsAGPRSrc = TRI->isAGPRClass(MRI->getRegClass(SrcReg)); + if (!IsAGPRSrc) + break; + + // def_instructions() does not look at subregs so it may give us a + // different instruction that defines the same vreg but different subreg + // so we have to manually check subreg. 
+ Register SrcSubReg = I.getOperand(1).getSubReg(); + for (auto &Def : MRI->def_instructions(SrcReg)) { + if (SrcSubReg != Def.getOperand(0).getSubReg()) + continue; + + if (Def.getOpcode() == AMDGPU::V_ACCVGPR_WRITE_B32_e64) { + MachineOperand DefSrcMO = Def.getOperand(1); + + // Immediates are not an issue and can be propagated in + // postrapseudos pass. Only handle cases where defining + // accvgpr_write source is a vreg. + if (DefSrcMO.isReg() && DefSrcMO.getReg().isVirtual()) { + // Propagate source reg of accvgpr write to this copy instruction + I.getOperand(1).setReg(DefSrcMO.getReg()); + I.getOperand(1).setSubReg(DefSrcMO.getSubReg()); + + // Reg uses were changed, collect unique set of registers to update + // live intervals at the end. + ModifiedRegs.insert(DefSrcMO.getReg()); + ModifiedRegs.insert(SrcReg); + + Changed = true; + } + + // Found the defining accvgpr_write, stop looking any further. + break; + } + } + break; + } + case AMDGPU::S_MOV_B32: + if (I.getOperand(0).getReg() != Reg || !I.getOperand(1).isImm() || + I.getNumOperands() != 2) + return false; + + switch (I.getOperand(0).getSubReg()) { + default: return false; - Def1 = &I; - Init |= static_cast<uint64_t>(I.getOperand(1).getImm()) << 32; + case AMDGPU::sub0: + if (Def0) + return false; + Def0 = &I; + Init |= I.getOperand(1).getImm() & 0xffffffff; + break; + case AMDGPU::sub1: + if (Def1) + return false; + Def1 = &I; + Init |= static_cast<uint64_t>(I.getOperand(1).getImm()) << 32; + break; + } break; } } + // For AGPR reg, check if live intervals need to be updated. + if (IsAGPRDst) { + if (Changed) { + for (Register RegToUpdate : ModifiedRegs) { + LIS->removeInterval(RegToUpdate); + LIS->createAndComputeVirtRegInterval(RegToUpdate); + } + } + + return Changed; + } + + // For SGPR reg, check if we can combine instructions. 
if (!Def0 || !Def1 || Def0->getParent() != Def1->getParent()) - return false; + return Changed; LLVM_DEBUG(dbgs() << "Combining:\n " << *Def0 << " " << *Def1 << " =>\n"); @@ -144,7 +219,7 @@ TII = ST.getInstrInfo(); MRI = &MF.getRegInfo(); LIS = &getAnalysis<LiveIntervals>(); - const SIRegisterInfo *TRI = ST.getRegisterInfo(); + TRI = ST.getRegisterInfo(); bool Changed = false; @@ -153,8 +228,10 @@ if (!LIS->hasInterval(Reg)) continue; const TargetRegisterClass *RC = MRI->getRegClass(Reg); - if (RC->MC->getSizeInBits() != 64 || !TRI->isSGPRClass(RC)) + if ((RC->MC->getSizeInBits() != 64 || !TRI->isSGPRClass(RC)) && + (ST.hasGFX90AInsts() || !TRI->isAGPRClass(RC))) continue; + Changed |= processReg(Reg); } diff --git a/llvm/test/CodeGen/AMDGPU/agpr-to-agpr-copy.mir b/llvm/test/CodeGen/AMDGPU/agpr-to-agpr-copy.mir new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/agpr-to-agpr-copy.mir @@ -0,0 +1,139 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py +# RUN: llc -march=amdgcn -mcpu=gfx908 -run-pass=liveintervals,amdgpu-pre-ra-optimizations -verify-machineinstrs %s -o - | FileCheck -check-prefix=GFX908 %s + +--- +name: test_mfma_f32_4x4x1f32_propagate_vgpr +tracksRegLiveness: true + +body: | + bb.0: + liveins: $sgpr0_sgpr1 + ; GFX908-LABEL: name: test_mfma_f32_4x4x1f32_propagate_vgpr + ; GFX908: liveins: $sgpr0_sgpr1 + ; GFX908: [[COPY:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr0_sgpr1 + ; GFX908: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[COPY]](p4), 36, 0 :: (dereferenceable invariant load (s64), align 4, addrspace 4) + ; GFX908: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + ; GFX908: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 1123418112, implicit $exec + ; GFX908: undef %4.sub0:areg_128 = V_ACCVGPR_WRITE_B32_e64 [[V_MOV_B32_e32_1]], implicit $exec + ; GFX908: %4.sub1:areg_128 = COPY [[V_MOV_B32_e32_1]] + ; GFX908: %4.sub2:areg_128 = COPY [[V_MOV_B32_e32_1]] + ; GFX908: %4.sub3:areg_128 
= COPY [[V_MOV_B32_e32_1]] + ; GFX908: [[V_MOV_B32_e32_2:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 1073741824, implicit $exec + ; GFX908: [[V_MOV_B32_e32_3:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 1065353216, implicit $exec + ; GFX908: [[V_MFMA_F32_4X4X1F32_e64_:%[0-9]+]]:areg_128 = V_MFMA_F32_4X4X1F32_e64 [[V_MOV_B32_e32_3]], [[V_MOV_B32_e32_2]], %4, 0, 0, 0, implicit $mode, implicit $exec + ; GFX908: [[COPY1:%[0-9]+]]:vreg_128 = COPY [[V_MFMA_F32_4X4X1F32_e64_]] + ; GFX908: GLOBAL_STORE_DWORDX4_SADDR [[V_MOV_B32_e32_]], [[COPY1]], [[S_LOAD_DWORDX2_IMM]], 0, 0, implicit $exec :: (store (s128), addrspace 1) + ; GFX908: S_ENDPGM 0 + %1:sgpr_64(p4) = COPY $sgpr0_sgpr1 + %4:sreg_64_xexec = S_LOAD_DWORDX2_IMM %1:sgpr_64(p4), 36, 0 :: (dereferenceable invariant load (s64), align 4, addrspace 4) + %5:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %13:vgpr_32 = V_MOV_B32_e32 1123418112, implicit $exec + undef %11.sub0:areg_128 = V_ACCVGPR_WRITE_B32_e64 %13:vgpr_32, implicit $exec + %11.sub1:areg_128 = COPY %11.sub0:areg_128 + %11.sub2:areg_128 = COPY %11.sub0:areg_128 + %11.sub3:areg_128 = COPY %11.sub0:areg_128 + %8:vgpr_32 = V_MOV_B32_e32 1073741824, implicit $exec + %9:vgpr_32 = V_MOV_B32_e32 1065353216, implicit $exec + %10:areg_128 = V_MFMA_F32_4X4X1F32_e64 %9:vgpr_32, %8:vgpr_32, %11:areg_128, 0, 0, 0, implicit $mode, implicit $exec + %12:vreg_128 = COPY %10:areg_128 + GLOBAL_STORE_DWORDX4_SADDR %5:vgpr_32, %12:vreg_128, %4:sreg_64_xexec, 0, 0, implicit $exec :: (store (s128), addrspace 1) + S_ENDPGM 0 +... 
+--- +name: test_mfma_f32_4x4x1f32_no_propagate_imm +tracksRegLiveness: true + +body: | + bb.0: + liveins: $sgpr0_sgpr1 + ; GFX908-LABEL: name: test_mfma_f32_4x4x1f32_no_propagate_imm + ; GFX908: liveins: $sgpr0_sgpr1 + ; GFX908: [[COPY:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr0_sgpr1 + ; GFX908: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[COPY]](p4), 36, 0 :: (dereferenceable invariant load (s64), align 4, addrspace 4) + ; GFX908: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + ; GFX908: undef %3.sub0:areg_128 = V_ACCVGPR_WRITE_B32_e64 1073741824, implicit $exec + ; GFX908: %3.sub1:areg_128 = COPY %3.sub0 + ; GFX908: %3.sub2:areg_128 = COPY %3.sub0 + ; GFX908: %3.sub3:areg_128 = COPY %3.sub0 + ; GFX908: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 1073741824, implicit $exec + ; GFX908: [[V_MOV_B32_e32_2:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 1065353216, implicit $exec + ; GFX908: [[V_MFMA_F32_4X4X1F32_e64_:%[0-9]+]]:areg_128 = V_MFMA_F32_4X4X1F32_e64 [[V_MOV_B32_e32_2]], [[V_MOV_B32_e32_1]], %3, 0, 0, 0, implicit $mode, implicit $exec + ; GFX908: [[COPY1:%[0-9]+]]:vreg_128 = COPY [[V_MFMA_F32_4X4X1F32_e64_]] + ; GFX908: GLOBAL_STORE_DWORDX4_SADDR [[V_MOV_B32_e32_]], [[COPY1]], [[S_LOAD_DWORDX2_IMM]], 0, 0, implicit $exec :: (store (s128), addrspace 1) + ; GFX908: S_ENDPGM 0 + %1:sgpr_64(p4) = COPY $sgpr0_sgpr1 + %4:sreg_64_xexec = S_LOAD_DWORDX2_IMM %1:sgpr_64(p4), 36, 0 :: (dereferenceable invariant load (s64), align 4, addrspace 4) + %5:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + undef %11.sub0:areg_128 = V_ACCVGPR_WRITE_B32_e64 1073741824, implicit $exec + %11.sub1:areg_128 = COPY %11.sub0:areg_128 + %11.sub2:areg_128 = COPY %11.sub0:areg_128 + %11.sub3:areg_128 = COPY %11.sub0:areg_128 + %8:vgpr_32 = V_MOV_B32_e32 1073741824, implicit $exec + %9:vgpr_32 = V_MOV_B32_e32 1065353216, implicit $exec + %10:areg_128 = V_MFMA_F32_4X4X1F32_e64 %9:vgpr_32, %8:vgpr_32, %11:areg_128, 0, 0, 0, implicit $mode, implicit $exec + 
%12:vreg_128 = COPY %10:areg_128 + GLOBAL_STORE_DWORDX4_SADDR %5:vgpr_32, %12:vreg_128, %4:sreg_64_xexec, 0, 0, implicit $exec :: (store (s128), addrspace 1) + S_ENDPGM 0 +... +--- +name: test_vgpr_subreg_propagate +tracksRegLiveness: true + +body: | + bb.0: + liveins: $vgpr0_vgpr1_vgpr2_vgpr3 + ; GFX908-LABEL: name: test_vgpr_subreg_propagate + ; GFX908: liveins: $vgpr0_vgpr1_vgpr2_vgpr3 + ; GFX908: [[COPY:%[0-9]+]]:vreg_128 = COPY $vgpr0_vgpr1_vgpr2_vgpr3, implicit $exec + ; GFX908: undef %1.sub0:areg_128 = V_ACCVGPR_WRITE_B32_e64 [[COPY]].sub0, implicit $exec + ; GFX908: %1.sub1:areg_128 = COPY [[COPY]].sub0 + ; GFX908: %1.sub2:areg_128 = COPY [[COPY]].sub0 + ; GFX908: %1.sub3:areg_128 = COPY [[COPY]].sub0 + ; GFX908: S_ENDPGM 0, implicit [[COPY]], implicit %1 + %0:vreg_128 = COPY $vgpr0_vgpr1_vgpr2_vgpr3, implicit $exec + undef %1.sub0:areg_128 = V_ACCVGPR_WRITE_B32_e64 %0.sub0, implicit $exec + %1.sub1:areg_128 = COPY %1.sub0:areg_128 + %1.sub2:areg_128 = COPY %1.sub0:areg_128 + %1.sub3:areg_128 = COPY %1.sub0:areg_128 + S_ENDPGM 0, implicit %0, implicit %1 +... +--- +name: test_nonmatching_agpr_subreg_no_propagate +tracksRegLiveness: true + +body: | + bb.0: + liveins: $vgpr0_vgpr1 + ; GFX908-LABEL: name: test_nonmatching_agpr_subreg_no_propagate + ; GFX908: liveins: $vgpr0_vgpr1 + ; GFX908: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1, implicit $exec + ; GFX908: undef %1.sub0:areg_64 = V_ACCVGPR_WRITE_B32_e64 [[COPY]].sub0, implicit $exec + ; GFX908: %1.sub1:areg_64 = V_ACCVGPR_WRITE_B32_e64 [[COPY]].sub1, implicit $exec + ; GFX908: [[COPY1:%[0-9]+]]:areg_64 = COPY %1 + ; GFX908: S_ENDPGM 0, implicit [[COPY]], implicit %1, implicit [[COPY1]] + %0:vreg_64 = COPY $vgpr0_vgpr1, implicit $exec + undef %1.sub0:areg_64 = V_ACCVGPR_WRITE_B32_e64 %0.sub0, implicit $exec + %1.sub1:areg_64 = V_ACCVGPR_WRITE_B32_e64 %0.sub1, implicit $exec + %2:areg_64 = COPY %1:areg_64 + S_ENDPGM 0, implicit %0, implicit %1, implicit %2 +... 
+--- +name: test_subreg_to_single_agpr_reg_propagate +tracksRegLiveness: true + +body: | + bb.0: + liveins: $vgpr0_vgpr1 + ; GFX908-LABEL: name: test_subreg_to_single_agpr_reg_propagate + ; GFX908: liveins: $vgpr0_vgpr1 + ; GFX908: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1, implicit $exec + ; GFX908: undef %1.sub0:areg_64 = V_ACCVGPR_WRITE_B32_e64 [[COPY]].sub0, implicit $exec + ; GFX908: %1.sub1:areg_64 = V_ACCVGPR_WRITE_B32_e64 [[COPY]].sub1, implicit $exec + ; GFX908: [[COPY1:%[0-9]+]]:agpr_32 = COPY [[COPY]].sub1 + ; GFX908: S_ENDPGM 0, implicit [[COPY]], implicit %1, implicit [[COPY1]] + %0:vreg_64 = COPY $vgpr0_vgpr1, implicit $exec + undef %1.sub0:areg_64 = V_ACCVGPR_WRITE_B32_e64 %0.sub0, implicit $exec + %1.sub1:areg_64 = V_ACCVGPR_WRITE_B32_e64 %0.sub1, implicit $exec + %2:agpr_32 = COPY %1.sub1:areg_64 + S_ENDPGM 0, implicit %0, implicit %1, implicit %2 +... diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.ll --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.ll @@ -578,19 +578,12 @@ ret void } -; FIXME: Resulting code for splat is pretty bad. A v_mov_b32 is moved -; in the middle of the expanded agpr reg_sequence. The broadcast of -; the individual AGPR->AGPR components should avoid the intermediate AGPR case. 
; GCN-LABEL: {{^}}test_mfma_f32_4x4x1f32_lit_splat_bad_code: ; GFX908_A: v_mov_b32_e32 [[TMP0:v[0-9]+]], 0x42f60000 ; GCN: v_accvgpr_write_b32 [[AGPR:a[0-9]+]], [[TMP0]] -; GFX908: s_nop 0 -; GFX908: v_accvgpr_read_b32 [[TMP1:v[0-9]+]], [[AGPR]] -; GFX908: v_accvgpr_read_b32 [[TMP2:v[0-9]+]], [[AGPR]] -; GFX908: v_accvgpr_read_b32 [[TMP3:v[0-9]+]], [[AGPR]] -; GFX908: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP1]] -; GFX908: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP2]] -; GFX908: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP3]] +; GFX908-NEXT: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP0]] +; GFX908-NEXT: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP0]] +; GFX908-NEXT: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP0]] ; GFX90A-COUNT-3: v_accvgpr_mov_b32 a{{[0-9]+}}, [[AGPR]] ; GCN: s_nop 0 ; GFX908_A: v_mfma_f32_4x4x1f32 a[{{[0-9]+:[0-9]+}}], {{v[0-9]+}}, {{v[0-9]+}}, a[{{[0-9]+:[0-9]+}}]