diff --git a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
--- a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
+++ b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
@@ -1744,6 +1744,8 @@
   if (!ARC)
     return false;
 
+  bool IsAGPR32 = (ARC == &AMDGPU::AGPR_32RegClass);
+
   // Rewrite the PHI's incoming values to ARC.
   LLVM_DEBUG(dbgs() << "Folding AGPR copies into: " << PHI);
   for (unsigned K = 1; K < PHI.getNumExplicitOperands(); K += 2) {
@@ -1754,7 +1756,7 @@
     MachineBasicBlock *InsertMBB = nullptr;
 
     // Look at the def of Reg, ignoring all copies.
-    bool UseAccVGPRWrite = false;
+    unsigned CopyOpc = AMDGPU::COPY;
     if (MachineInstr *Def = MRI->getVRegDef(Reg)) {
 
       // Look at pre-existing COPY instructions from ARC: Steal the operand. If
@@ -1772,21 +1774,21 @@
         // GFX908 directly instead of a COPY. Otherwise, SIFoldOperand may try
         // to fold the sgpr -> vgpr -> agpr copy into a sgpr -> agpr copy which
         // is unlikely to be profitable.
+        //
+        // Note that V_ACCVGPR_WRITE is only used for AGPR_32.
         MachineOperand &CopyIn = Def->getOperand(1);
-        if (!ST->hasGFX90AInsts() && !MRI->hasOneNonDBGUse(Reg) &&
+        if (IsAGPR32 && !ST->hasGFX90AInsts() && !MRI->hasOneNonDBGUse(Reg) &&
             TRI->isSGPRReg(*MRI, CopyIn.getReg()))
-          UseAccVGPRWrite = true;
+          CopyOpc = AMDGPU::V_ACCVGPR_WRITE_B32_e64;
       }
 
-      InsertPt = ++Def->getIterator();
       InsertMBB = Def->getParent();
+      InsertPt = InsertMBB->SkipPHIsLabelsAndDebug(++Def->getIterator());
     } else {
       InsertMBB = PHI.getOperand(MO.getOperandNo() + 1).getMBB();
       InsertPt = InsertMBB->getFirstTerminator();
     }
 
-    const unsigned CopyOpc =
-        UseAccVGPRWrite ? AMDGPU::V_ACCVGPR_WRITE_B32_e64 : AMDGPU::COPY;
     Register NewReg = MRI->createVirtualRegister(ARC);
     MachineInstr *MI = BuildMI(*InsertMBB, InsertPt, PHI.getDebugLoc(),
                                TII->get(CopyOpc), NewReg)
diff --git a/llvm/test/CodeGen/AMDGPU/fold-agpr-phis.mir b/llvm/test/CodeGen/AMDGPU/fold-agpr-phis.mir
--- a/llvm/test/CodeGen/AMDGPU/fold-agpr-phis.mir
+++ b/llvm/test/CodeGen/AMDGPU/fold-agpr-phis.mir
@@ -40,6 +40,7 @@
   ; GFX908-NEXT: {{  $}}
   ; GFX908-NEXT: bb.2:
   ; GFX908-NEXT:   S_ENDPGM 0
+  ;
   ; GFX90A-LABEL: name: test_sgpr_init_multiuse
   ; GFX90A: bb.0:
   ; GFX90A-NEXT:   successors: %bb.1(0x80000000)
@@ -101,6 +102,90 @@
     S_ENDPGM 0
 ...
+---
+name: test_sgpr_init_multiuse_agprtuple
+tracksRegLiveness: true
+
+body: |
+  ; GFX908-LABEL: name: test_sgpr_init_multiuse_agprtuple
+  ; GFX908: bb.0:
+  ; GFX908-NEXT:   successors: %bb.1(0x80000000)
+  ; GFX908-NEXT:   liveins: $sgpr0_sgpr1, $scc
+  ; GFX908-NEXT: {{  $}}
+  ; GFX908-NEXT:   [[COPY:%[0-9]+]]:sgpr_64 = COPY $sgpr0_sgpr1
+  ; GFX908-NEXT:   [[COPY1:%[0-9]+]]:vreg_64_align2 = COPY [[COPY]]
+  ; GFX908-NEXT:   [[COPY2:%[0-9]+]]:areg_64_align2 = COPY [[COPY1]]
+  ; GFX908-NEXT:   [[COPY3:%[0-9]+]]:areg_64_align2 = COPY [[COPY1]]
+  ; GFX908-NEXT: {{  $}}
+  ; GFX908-NEXT: bb.1:
+  ; GFX908-NEXT:   successors: %bb.1(0x40000000), %bb.2(0x40000000)
+  ; GFX908-NEXT:   liveins: $scc
+  ; GFX908-NEXT: {{  $}}
+  ; GFX908-NEXT:   [[PHI:%[0-9]+]]:areg_64_align2 = PHI [[COPY3]], %bb.0, %9.sub0_sub1, %bb.1
+  ; GFX908-NEXT:   [[PHI1:%[0-9]+]]:areg_64_align2 = PHI [[COPY2]], %bb.0, %9.sub2_sub3, %bb.1
+  ; GFX908-NEXT:   [[COPY4:%[0-9]+]]:vreg_64_align2 = COPY [[PHI1]]
+  ; GFX908-NEXT:   [[COPY5:%[0-9]+]]:vreg_64_align2 = COPY [[PHI]]
+  ; GFX908-NEXT:   [[REG_SEQUENCE:%[0-9]+]]:areg_128_align2 = REG_SEQUENCE [[COPY5]].sub0, %subreg.sub0, [[COPY5]].sub1, %subreg.sub1, [[COPY4]].sub0, %subreg.sub2, [[COPY4]].sub1, %subreg.sub3
+  ; GFX908-NEXT:   [[V_MOV_B64_e32_:%[0-9]+]]:vreg_64_align2 = V_MOV_B64_e32 1073741824, implicit $exec
+  ; GFX908-NEXT:   [[V_MOV_B64_e32_1:%[0-9]+]]:vreg_64_align2 = V_MOV_B64_e32 1065353216, implicit $exec
+  ; GFX908-NEXT:   [[V_MFMA_F32_4X4X1F32_e64_:%[0-9]+]]:areg_128_align2 = V_MFMA_F32_4X4X1F32_e64 [[V_MOV_B64_e32_1]].sub0, [[V_MOV_B64_e32_]].sub1, [[REG_SEQUENCE]], 0, 0, 0, implicit $mode, implicit $exec
+  ; GFX908-NEXT:   S_CBRANCH_SCC1 %bb.1, implicit $scc
+  ; GFX908-NEXT: {{  $}}
+  ; GFX908-NEXT: bb.2:
+  ; GFX908-NEXT:   S_ENDPGM 0
+  ;
+  ; GFX90A-LABEL: name: test_sgpr_init_multiuse_agprtuple
+  ; GFX90A: bb.0:
+  ; GFX90A-NEXT:   successors: %bb.1(0x80000000)
+  ; GFX90A-NEXT:   liveins: $sgpr0_sgpr1, $scc
+  ; GFX90A-NEXT: {{  $}}
+  ; GFX90A-NEXT:   [[COPY:%[0-9]+]]:sgpr_64 = COPY $sgpr0_sgpr1
+  ; GFX90A-NEXT:   [[COPY1:%[0-9]+]]:vreg_64_align2 = COPY [[COPY]]
+  ; GFX90A-NEXT:   [[COPY2:%[0-9]+]]:areg_64_align2 = COPY [[COPY1]]
+  ; GFX90A-NEXT:   [[COPY3:%[0-9]+]]:areg_64_align2 = COPY [[COPY1]]
+  ; GFX90A-NEXT: {{  $}}
+  ; GFX90A-NEXT: bb.1:
+  ; GFX90A-NEXT:   successors: %bb.1(0x40000000), %bb.2(0x40000000)
+  ; GFX90A-NEXT:   liveins: $scc
+  ; GFX90A-NEXT: {{  $}}
+  ; GFX90A-NEXT:   [[PHI:%[0-9]+]]:areg_64_align2 = PHI [[COPY3]], %bb.0, %9.sub0_sub1, %bb.1
+  ; GFX90A-NEXT:   [[PHI1:%[0-9]+]]:areg_64_align2 = PHI [[COPY2]], %bb.0, %9.sub2_sub3, %bb.1
+  ; GFX90A-NEXT:   [[COPY4:%[0-9]+]]:vreg_64_align2 = COPY [[PHI1]]
+  ; GFX90A-NEXT:   [[COPY5:%[0-9]+]]:vreg_64_align2 = COPY [[PHI]]
+  ; GFX90A-NEXT:   [[REG_SEQUENCE:%[0-9]+]]:areg_128_align2 = REG_SEQUENCE [[COPY5]].sub0, %subreg.sub0, [[COPY5]].sub1, %subreg.sub1, [[COPY4]].sub0, %subreg.sub2, [[COPY4]].sub1, %subreg.sub3
+  ; GFX90A-NEXT:   [[V_MOV_B64_e32_:%[0-9]+]]:vreg_64_align2 = V_MOV_B64_e32 1073741824, implicit $exec
+  ; GFX90A-NEXT:   [[V_MOV_B64_e32_1:%[0-9]+]]:vreg_64_align2 = V_MOV_B64_e32 1065353216, implicit $exec
+  ; GFX90A-NEXT:   [[V_MFMA_F32_4X4X1F32_e64_:%[0-9]+]]:areg_128_align2 = V_MFMA_F32_4X4X1F32_e64 [[V_MOV_B64_e32_1]].sub0, [[V_MOV_B64_e32_]].sub1, [[REG_SEQUENCE]], 0, 0, 0, implicit $mode, implicit $exec
+  ; GFX90A-NEXT:   S_CBRANCH_SCC1 %bb.1, implicit $scc
+  ; GFX90A-NEXT: {{  $}}
+  ; GFX90A-NEXT: bb.2:
+  ; GFX90A-NEXT:   S_ENDPGM 0
+  bb.0:
+    successors: %bb.1
+    liveins: $sgpr0_sgpr1, $scc
+
+    %0:sgpr_64 = COPY $sgpr0_sgpr1
+    %1:vreg_64_align2 = COPY %0:sgpr_64
+
+  bb.1:
+    successors: %bb.1, %bb.2
+    liveins: $scc
+
+    %2:vreg_64_align2 = PHI %1, %bb.0, %3, %bb.1
+    %4:vreg_64_align2 = PHI %1, %bb.0, %5, %bb.1
+    %6:areg_128_align2 = REG_SEQUENCE %2.sub0, %subreg.sub0, %2.sub1, %subreg.sub1, %4.sub0, %subreg.sub2, %4.sub1, %subreg.sub3
+    %7:vreg_64_align2 = V_MOV_B64_e32 1073741824, implicit $exec
+    %8:vreg_64_align2 = V_MOV_B64_e32 1065353216, implicit $exec
+    %9:areg_128_align2 = V_MFMA_F32_4X4X1F32_e64 %8.sub0, %7.sub1, %6:areg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %3:vreg_64_align2 = COPY %9.sub0_sub1:areg_128_align2
+    %5:vreg_64_align2 = COPY %9.sub2_sub3:areg_128_align2
+    S_CBRANCH_SCC1 %bb.1, implicit $scc
+
+  bb.2:
+
+    S_ENDPGM 0
+...
+
 ---
 name: test_sgpr_init_singleuse
 tracksRegLiveness: true
 
 body: |
@@ -141,6 +226,7 @@
   ; GFX908-NEXT: {{  $}}
   ; GFX908-NEXT: bb.2:
   ; GFX908-NEXT:   S_ENDPGM 0
+  ;
   ; GFX90A-LABEL: name: test_sgpr_init_singleuse
   ; GFX90A: bb.0:
   ; GFX90A-NEXT:   successors: %bb.1(0x80000000)
@@ -244,6 +330,7 @@
   ; GFX908-NEXT: {{  $}}
   ; GFX908-NEXT: bb.2:
   ; GFX908-NEXT:   S_ENDPGM 0
+  ;
   ; GFX90A-LABEL: name: test_vgpr_init
   ; GFX90A: bb.0:
   ; GFX90A-NEXT:   successors: %bb.1(0x80000000)
@@ -343,6 +430,7 @@
   ; GFX908-NEXT: {{  $}}
   ; GFX908-NEXT: bb.2:
   ; GFX908-NEXT:   S_ENDPGM 0
+  ;
   ; GFX90A-LABEL: name: test_use_vgpr_temp
   ; GFX90A: bb.0:
   ; GFX90A-NEXT:   successors: %bb.1(0x80000000)
@@ -445,6 +533,7 @@
   ; GFX908-NEXT: {{  $}}
   ; GFX908-NEXT: bb.2:
   ; GFX908-NEXT:   S_ENDPGM 0
+  ;
   ; GFX90A-LABEL: name: test_vgpr_init_two_copies
   ; GFX90A: bb.0:
   ; GFX90A-NEXT:   successors: %bb.1(0x80000000)
@@ -504,3 +593,125 @@
   bb.2:
     S_ENDPGM 0
 ...
+
+---
+name: test_vgpr_init_skip_phis_insertpt
+tracksRegLiveness: true
+
+body: |
+  ; GFX908-LABEL: name: test_vgpr_init_skip_phis_insertpt
+  ; GFX908: bb.0:
+  ; GFX908-NEXT:   successors: %bb.1(0x80000000)
+  ; GFX908-NEXT:   liveins: $vgpr0, $vgpr1, $scc
+  ; GFX908-NEXT: {{  $}}
+  ; GFX908-NEXT:   [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+  ; GFX908-NEXT:   [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+  ; GFX908-NEXT: {{  $}}
+  ; GFX908-NEXT: bb.1:
+  ; GFX908-NEXT:   successors: %bb.1(0x40000000), %bb.2(0x40000000)
+  ; GFX908-NEXT:   liveins: $scc
+  ; GFX908-NEXT: {{  $}}
+  ; GFX908-NEXT:   [[PHI:%[0-9]+]]:vgpr_32 = PHI [[COPY]], %bb.0, [[COPY1]], %bb.1
+  ; GFX908-NEXT:   [[PHI1:%[0-9]+]]:vgpr_32 = PHI [[COPY]], %bb.0, [[COPY1]], %bb.1
+  ; GFX908-NEXT:   [[COPY2:%[0-9]+]]:agpr_32 = COPY [[PHI]]
+  ; GFX908-NEXT:   [[COPY3:%[0-9]+]]:agpr_32 = COPY [[PHI]]
+  ; GFX908-NEXT:   [[COPY4:%[0-9]+]]:agpr_32 = COPY [[PHI]]
+  ; GFX908-NEXT:   [[COPY5:%[0-9]+]]:agpr_32 = COPY [[PHI]]
+  ; GFX908-NEXT:   S_CBRANCH_SCC1 %bb.1, implicit $scc
+  ; GFX908-NEXT: {{  $}}
+  ; GFX908-NEXT: bb.2:
+  ; GFX908-NEXT:   successors: %bb.2(0x40000000), %bb.3(0x40000000)
+  ; GFX908-NEXT:   liveins: $scc
+  ; GFX908-NEXT: {{  $}}
+  ; GFX908-NEXT:   [[PHI2:%[0-9]+]]:agpr_32 = PHI [[COPY5]], %bb.1, %15.sub0, %bb.2
+  ; GFX908-NEXT:   [[PHI3:%[0-9]+]]:agpr_32 = PHI [[COPY4]], %bb.1, %15.sub1, %bb.2
+  ; GFX908-NEXT:   [[PHI4:%[0-9]+]]:agpr_32 = PHI [[COPY3]], %bb.1, %15.sub2, %bb.2
+  ; GFX908-NEXT:   [[PHI5:%[0-9]+]]:agpr_32 = PHI [[COPY2]], %bb.1, %15.sub3, %bb.2
+  ; GFX908-NEXT:   [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[PHI5]]
+  ; GFX908-NEXT:   [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[PHI4]]
+  ; GFX908-NEXT:   [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[PHI3]]
+  ; GFX908-NEXT:   [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[PHI2]]
+  ; GFX908-NEXT:   [[REG_SEQUENCE:%[0-9]+]]:areg_128_align2 = REG_SEQUENCE [[COPY9]], %subreg.sub0, [[COPY8]], %subreg.sub1, [[COPY7]], %subreg.sub2, [[COPY6]], %subreg.sub3
+  ; GFX908-NEXT:   [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 1073741824, implicit $exec
+  ; GFX908-NEXT:   [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 1065353216, implicit $exec
+  ; GFX908-NEXT:   [[V_MFMA_F32_4X4X1F32_e64_:%[0-9]+]]:areg_128_align2 = V_MFMA_F32_4X4X1F32_e64 [[V_MOV_B32_e32_1]], [[V_MOV_B32_e32_]], [[REG_SEQUENCE]], 0, 0, 0, implicit $mode, implicit $exec
+  ; GFX908-NEXT:   S_CBRANCH_SCC1 %bb.2, implicit $scc
+  ; GFX908-NEXT: {{  $}}
+  ; GFX908-NEXT: bb.3:
+  ; GFX908-NEXT:   S_ENDPGM 0
+  ;
+  ; GFX90A-LABEL: name: test_vgpr_init_skip_phis_insertpt
+  ; GFX90A: bb.0:
+  ; GFX90A-NEXT:   successors: %bb.1(0x80000000)
+  ; GFX90A-NEXT:   liveins: $vgpr0, $vgpr1, $scc
+  ; GFX90A-NEXT: {{  $}}
+  ; GFX90A-NEXT:   [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+  ; GFX90A-NEXT:   [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+  ; GFX90A-NEXT: {{  $}}
+  ; GFX90A-NEXT: bb.1:
+  ; GFX90A-NEXT:   successors: %bb.1(0x40000000), %bb.2(0x40000000)
+  ; GFX90A-NEXT:   liveins: $scc
+  ; GFX90A-NEXT: {{  $}}
+  ; GFX90A-NEXT:   [[PHI:%[0-9]+]]:vgpr_32 = PHI [[COPY]], %bb.0, [[COPY1]], %bb.1
+  ; GFX90A-NEXT:   [[PHI1:%[0-9]+]]:vgpr_32 = PHI [[COPY]], %bb.0, [[COPY1]], %bb.1
+  ; GFX90A-NEXT:   [[COPY2:%[0-9]+]]:agpr_32 = COPY [[PHI]]
+  ; GFX90A-NEXT:   [[COPY3:%[0-9]+]]:agpr_32 = COPY [[PHI]]
+  ; GFX90A-NEXT:   [[COPY4:%[0-9]+]]:agpr_32 = COPY [[PHI]]
+  ; GFX90A-NEXT:   [[COPY5:%[0-9]+]]:agpr_32 = COPY [[PHI]]
+  ; GFX90A-NEXT:   S_CBRANCH_SCC1 %bb.1, implicit $scc
+  ; GFX90A-NEXT: {{  $}}
+  ; GFX90A-NEXT: bb.2:
+  ; GFX90A-NEXT:   successors: %bb.2(0x40000000), %bb.3(0x40000000)
+  ; GFX90A-NEXT:   liveins: $scc
+  ; GFX90A-NEXT: {{  $}}
+  ; GFX90A-NEXT:   [[PHI2:%[0-9]+]]:agpr_32 = PHI [[COPY5]], %bb.1, %15.sub0, %bb.2
+  ; GFX90A-NEXT:   [[PHI3:%[0-9]+]]:agpr_32 = PHI [[COPY4]], %bb.1, %15.sub1, %bb.2
+  ; GFX90A-NEXT:   [[PHI4:%[0-9]+]]:agpr_32 = PHI [[COPY3]], %bb.1, %15.sub2, %bb.2
+  ; GFX90A-NEXT:   [[PHI5:%[0-9]+]]:agpr_32 = PHI [[COPY2]], %bb.1, %15.sub3, %bb.2
+  ; GFX90A-NEXT:   [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[PHI5]]
+  ; GFX90A-NEXT:   [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[PHI4]]
+  ; GFX90A-NEXT:   [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[PHI3]]
+  ; GFX90A-NEXT:   [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[PHI2]]
+  ; GFX90A-NEXT:   [[REG_SEQUENCE:%[0-9]+]]:areg_128_align2 = REG_SEQUENCE [[COPY9]], %subreg.sub0, [[COPY8]], %subreg.sub1, [[COPY7]], %subreg.sub2, [[COPY6]], %subreg.sub3
+  ; GFX90A-NEXT:   [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 1073741824, implicit $exec
+  ; GFX90A-NEXT:   [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 1065353216, implicit $exec
+  ; GFX90A-NEXT:   [[V_MFMA_F32_4X4X1F32_e64_:%[0-9]+]]:areg_128_align2 = V_MFMA_F32_4X4X1F32_e64 [[V_MOV_B32_e32_1]], [[V_MOV_B32_e32_]], [[REG_SEQUENCE]], 0, 0, 0, implicit $mode, implicit $exec
+  ; GFX90A-NEXT:   S_CBRANCH_SCC1 %bb.2, implicit $scc
+  ; GFX90A-NEXT: {{  $}}
+  ; GFX90A-NEXT: bb.3:
+  ; GFX90A-NEXT:   S_ENDPGM 0
+  bb.0:
+    liveins: $vgpr0, $vgpr1, $scc
+    successors: %bb.1
+
+    %0:vgpr_32 = COPY $vgpr0
+    %1:vgpr_32 = COPY $vgpr0
+
+  bb.1:
+    liveins: $scc
+    successors: %bb.1, %bb.2
+
+    %6:vgpr_32 = PHI %0, %bb.0, %1, %bb.1
+    %7:vgpr_32 = PHI %0, %bb.0, %1, %bb.1
+    S_CBRANCH_SCC1 %bb.1, implicit $scc
+
+  bb.2:
+    liveins: $scc
+    successors: %bb.2, %bb.3
+    %8:vgpr_32 = PHI %6, %bb.1, %16, %bb.2
+    %9:vgpr_32 = PHI %6, %bb.1, %17, %bb.2
+    %10:vgpr_32 = PHI %6, %bb.1, %18, %bb.2
+    %11:vgpr_32 = PHI %6, %bb.1, %19, %bb.2
+    %12:areg_128_align2 = REG_SEQUENCE %8, %subreg.sub0, %9, %subreg.sub1, %10, %subreg.sub2, %11, %subreg.sub3
+    %13:vgpr_32 = V_MOV_B32_e32 1073741824, implicit $exec
+    %14:vgpr_32 = V_MOV_B32_e32 1065353216, implicit $exec
+    %15:areg_128_align2 = V_MFMA_F32_4X4X1F32_e64 %14:vgpr_32, %13:vgpr_32, %12:areg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %16:vgpr_32 = COPY %15.sub0
+    %17:vgpr_32 = COPY %15.sub1
+    %18:vgpr_32 = COPY %15.sub2
+    %19:vgpr_32 = COPY %15.sub3
+    S_CBRANCH_SCC1 %bb.2, implicit $scc
+
+  bb.3:
+    S_ENDPGM 0
+...