Index: llvm/lib/Target/AMDGPU/GCNDPPCombine.cpp =================================================================== --- llvm/lib/Target/AMDGPU/GCNDPPCombine.cpp +++ llvm/lib/Target/AMDGPU/GCNDPPCombine.cpp @@ -155,8 +155,6 @@ RegSubRegPair CombOldVGPR, bool CombBCZ) const { assert(MovMI.getOpcode() == AMDGPU::V_MOV_B32_dpp); - assert(TII->getNamedOperand(MovMI, AMDGPU::OpName::vdst)->getReg() == - TII->getNamedOperand(OrigMI, AMDGPU::OpName::src0)->getReg()); auto OrigOp = OrigMI.getOpcode(); auto DPPOp = getDPPOp(OrigOp); @@ -354,6 +352,7 @@ auto *DstOpnd = TII->getNamedOperand(MovMI, AMDGPU::OpName::vdst); assert(DstOpnd && DstOpnd->isReg()); auto DPPMovReg = DstOpnd->getReg(); + auto DPPMovSubReg = DstOpnd->getSubReg(); if (execMayBeModifiedBeforeAnyUse(*MRI, DPPMovReg, MovMI)) { LLVM_DEBUG(dbgs() << " failed: EXEC mask should remain the same" " for all uses\n"); @@ -418,6 +417,7 @@ dbgs() << ", bound_ctrl=" << CombBCZ << '\n'); SmallVector OrigMIs, DPPMIs; + SmallSetVector RegSeqs; auto CombOldVGPR = getRegSubRegPair(*OldOpnd); // try to reuse previous old reg if its undefined (IMPLICIT_DEF) if (CombBCZ && OldOpndValue) { // CombOldVGPR should be undef @@ -430,14 +430,45 @@ OrigMIs.push_back(&MovMI); bool Rollback = true; + SmallVector Uses; + for (auto &Use : MRI->use_nodbg_operands(DPPMovReg)) { + if (Use.getSubReg() == DPPMovSubReg) + Uses.push_back(&Use); + } + + while (!Uses.empty()) { + MachineOperand *Use = Uses.pop_back_val(); Rollback = true; - auto &OrigMI = *Use.getParent(); + auto &OrigMI = *Use->getParent(); LLVM_DEBUG(dbgs() << " try: " << OrigMI); auto OrigOp = OrigMI.getOpcode(); - if (TII->isVOP3(OrigOp)) { + if (OrigOp == AMDGPU::REG_SEQUENCE) { + Register FwdReg = OrigMI.getOperand(0).getReg(); + unsigned FwdSubReg = 0; + + if (execMayBeModifiedBeforeAnyUse(*MRI, FwdReg, OrigMI)) { + LLVM_DEBUG(dbgs() << " failed: EXEC mask should remain the same" + " for all uses\n"); + break; + } + + for (unsigned I = 1, E = OrigMI.getNumOperands(); I < E; I += 2) { + if (OrigMI.getOperand(I).getReg() == DPPMovReg) { + FwdSubReg = OrigMI.getOperand(I + 1).getImm(); + break; + } + } + assert(FwdSubReg); + for (auto &Op : MRI->use_nodbg_operands(FwdReg)) { + if (Op.getSubReg() == FwdSubReg) + Uses.push_back(&Op); + } + RegSeqs.insert(&OrigMI); + continue; + } else if (TII->isVOP3(OrigOp)) { if (!TII->hasVALU32BitEncoding(OrigOp)) { LLVM_DEBUG(dbgs() << " failed: VOP3 hasn't e32 equivalent\n"); break; @@ -457,14 +488,14 @@ } LLVM_DEBUG(dbgs() << " combining: " << OrigMI); - if (&Use == TII->getNamedOperand(OrigMI, AMDGPU::OpName::src0)) { + if (Use == TII->getNamedOperand(OrigMI, AMDGPU::OpName::src0)) { if (auto *DPPInst = createDPPInst(OrigMI, MovMI, CombOldVGPR, OldOpndValue, CombBCZ)) { DPPMIs.push_back(DPPInst); Rollback = false; } } else if (OrigMI.isCommutable() && - &Use == TII->getNamedOperand(OrigMI, AMDGPU::OpName::src1)) { + Use == TII->getNamedOperand(OrigMI, AMDGPU::OpName::src1)) { auto *BB = OrigMI.getParent(); auto *NewMI = BB->getParent()->CloneMachineInstr(&OrigMI); BB->insert(OrigMI, NewMI); @@ -485,9 +516,25 @@ OrigMIs.push_back(&OrigMI); } + Rollback |= !Uses.empty(); + for (auto *MI : *(Rollback? &DPPMIs : &OrigMIs)) MI->eraseFromParent(); + if (!Rollback) { + for (auto S : RegSeqs) { + if (MRI->use_nodbg_empty(S->getOperand(0).getReg())) { + S->eraseFromParent(); + continue; + } + for (unsigned I = 1, E = S->getNumOperands(); I < E; I += 2) { + MachineOperand &Op = S->getOperand(I); + if (!MRI->getVRegDef(Op.getReg())) + Op.setIsUndef(true); + } + } + } + return !Rollback; } Index: llvm/test/CodeGen/AMDGPU/dpp_combine.mir =================================================================== --- llvm/test/CodeGen/AMDGPU/dpp_combine.mir +++ llvm/test/CodeGen/AMDGPU/dpp_combine.mir @@ -562,3 +562,132 @@ %2:vgpr_32 = V_MOV_B32_dpp undef %1:vgpr_32, undef %0:vgpr_32, 1, 15, 15, 1, implicit $exec %3:vgpr_32 = V_CEIL_F32_e32 %2, implicit $exec ... + +# CHECK-LABEL: name: dpp_reg_sequence_both_combined +# CHECK: %0:vreg_64 = COPY $vgpr0_vgpr1 +# CHECK: %1:vreg_64 = COPY $vgpr2_vgpr3 +# CHECK: %2:vgpr_32 = V_MOV_B32_e32 5, implicit $exec +# CHECK: %9:vgpr_32 = IMPLICIT_DEF +# CHECK: %8:vgpr_32 = IMPLICIT_DEF +# CHECK: %6:vgpr_32 = V_ADD_I32_dpp %9, %1.sub0, %2, 1, 15, 15, 1, implicit-def $vcc, implicit $exec +# CHECK: %7:vgpr_32 = V_ADDC_U32_dpp %8, %1.sub1, %2, 1, 15, 15, 1, implicit-def $vcc, implicit $vcc, implicit $exec +name: dpp_reg_sequence_both_combined +tracksRegLiveness: true +body: | + bb.0: + liveins: $vgpr0_vgpr1, $vgpr2_vgpr3 + + %0:vreg_64 = COPY $vgpr0_vgpr1 + %1:vreg_64 = COPY $vgpr2_vgpr3 + %5:vgpr_32 = V_MOV_B32_e32 5, implicit $exec + %2:vgpr_32 = V_MOV_B32_dpp %0.sub0, %1.sub0, 1, 15, 15, 1, implicit $exec + %3:vgpr_32 = V_MOV_B32_dpp %0.sub1, %1.sub1, 1, 15, 15, 1, implicit $exec + %4:vreg_64 = REG_SEQUENCE %2, %subreg.sub0, %3, %subreg.sub1 + %6:vgpr_32 = V_ADD_I32_e32 %4.sub0, %5, implicit-def $vcc, implicit $exec + %7:vgpr_32 = V_ADDC_U32_e32 %4.sub1, %5, implicit-def $vcc, implicit $vcc, implicit $exec +... + +# CHECK-LABEL: name: dpp_reg_sequence_first_combined +# CHECK: %0:vreg_64 = COPY $vgpr0_vgpr1 +# CHECK: %1:vreg_64 = COPY $vgpr2_vgpr3 +# CHECK: %2:vgpr_32 = V_MOV_B32_e32 5, implicit $exec +# CHECK: %8:vgpr_32 = IMPLICIT_DEF +# CHECK: %4:vgpr_32 = V_MOV_B32_dpp %0.sub1, %1.sub1, 1, 1, 1, 1, implicit $exec +# CHECK: %5:vreg_64 = REG_SEQUENCE undef %3:vgpr_32, %subreg.sub0, %4, %subreg.sub1 +# CHECK: %6:vgpr_32 = V_ADD_I32_dpp %8, %1.sub0, %2, 1, 15, 15, 1, implicit-def $vcc, implicit $exec +# CHECK: %7:vgpr_32 = V_ADDC_U32_e32 %5.sub1, %2, implicit-def $vcc, implicit $vcc, implicit $exec +name: dpp_reg_sequence_first_combined +tracksRegLiveness: true +body: | + bb.0: + liveins: $vgpr0_vgpr1, $vgpr2_vgpr3 + + %0:vreg_64 = COPY $vgpr0_vgpr1 + %1:vreg_64 = COPY $vgpr2_vgpr3 + %5:vgpr_32 = V_MOV_B32_e32 5, implicit $exec + %2:vgpr_32 = V_MOV_B32_dpp %0.sub0, %1.sub0, 1, 15, 15, 1, implicit $exec + %3:vgpr_32 = V_MOV_B32_dpp %0.sub1, %1.sub1, 1, 1, 1, 1, implicit $exec + %4:vreg_64 = REG_SEQUENCE %2, %subreg.sub0, %3, %subreg.sub1 + %6:vgpr_32 = V_ADD_I32_e32 %4.sub0, %5, implicit-def $vcc, implicit $exec + %7:vgpr_32 = V_ADDC_U32_e32 %4.sub1, %5, implicit-def $vcc, implicit $vcc, implicit $exec +... + +# CHECK-LABEL: name: dpp_reg_sequence_second_combined +# CHECK: %0:vreg_64 = COPY $vgpr0_vgpr1 +# CHECK: %1:vreg_64 = COPY $vgpr2_vgpr3 +# CHECK: %2:vgpr_32 = V_MOV_B32_e32 5, implicit $exec +# CHECK: %3:vgpr_32 = V_MOV_B32_dpp %0.sub0, %1.sub0, 1, 1, 1, 1, implicit $exec +# CHECK: %8:vgpr_32 = IMPLICIT_DEF +# CHECK: %5:vreg_64 = REG_SEQUENCE %3, %subreg.sub0, undef %4:vgpr_32, %subreg.sub1 +# CHECK: %6:vgpr_32 = V_ADD_I32_e32 %5.sub0, %2, implicit-def $vcc, implicit $exec +# CHECK: %7:vgpr_32 = V_ADDC_U32_dpp %8, %1.sub1, %2, 1, 15, 15, 1, implicit-def $vcc, implicit $vcc, implicit $exec +name: dpp_reg_sequence_second_combined +tracksRegLiveness: true +body: | + bb.0: + liveins: $vgpr0_vgpr1, $vgpr2_vgpr3 + + %0:vreg_64 = COPY $vgpr0_vgpr1 + %1:vreg_64 = COPY $vgpr2_vgpr3 + %5:vgpr_32 = V_MOV_B32_e32 5, implicit $exec + %2:vgpr_32 = V_MOV_B32_dpp %0.sub0, %1.sub0, 1, 1, 1, 1, implicit $exec + %3:vgpr_32 = V_MOV_B32_dpp %0.sub1, %1.sub1, 1, 15, 15, 1, implicit $exec + %4:vreg_64 = REG_SEQUENCE %2, %subreg.sub0, %3, %subreg.sub1 + %6:vgpr_32 = V_ADD_I32_e32 %4.sub0, %5, implicit-def $vcc, implicit $exec + %7:vgpr_32 = V_ADDC_U32_e32 %4.sub1, %5, implicit-def $vcc, implicit $vcc, implicit $exec +... + +# CHECK-LABEL: name: dpp_reg_sequence_none_combined +# CHECK: %0:vreg_64 = COPY $vgpr0_vgpr1 +# CHECK: %1:vreg_64 = COPY $vgpr2_vgpr3 +# CHECK: %2:vgpr_32 = V_MOV_B32_e32 5, implicit $exec +# CHECK: %3:vgpr_32 = V_MOV_B32_dpp %0.sub0, %1.sub0, 1, 1, 1, 1, implicit $exec +# CHECK: %4:vgpr_32 = V_MOV_B32_dpp %0.sub1, %1.sub1, 1, 1, 1, 1, implicit $exec +# CHECK: %5:vreg_64 = REG_SEQUENCE %3, %subreg.sub0, %4, %subreg.sub1 +# CHECK: %6:vgpr_32 = V_ADD_I32_e32 %5.sub0, %2, implicit-def $vcc, implicit $exec +# CHECK: %7:vgpr_32 = V_ADDC_U32_e32 %5.sub1, %2, implicit-def $vcc, implicit $vcc, implicit $exec +name: dpp_reg_sequence_none_combined +tracksRegLiveness: true +body: | + bb.0: + liveins: $vgpr0_vgpr1, $vgpr2_vgpr3 + + %0:vreg_64 = COPY $vgpr0_vgpr1 + %1:vreg_64 = COPY $vgpr2_vgpr3 + %5:vgpr_32 = V_MOV_B32_e32 5, implicit $exec + %2:vgpr_32 = V_MOV_B32_dpp %0.sub0, %1.sub0, 1, 1, 1, 1, implicit $exec + %3:vgpr_32 = V_MOV_B32_dpp %0.sub1, %1.sub1, 1, 1, 1, 1, implicit $exec + %4:vreg_64 = REG_SEQUENCE %2, %subreg.sub0, %3, %subreg.sub1 + %6:vgpr_32 = V_ADD_I32_e32 %4.sub0, %5, implicit-def $vcc, implicit $exec + %7:vgpr_32 = V_ADDC_U32_e32 %4.sub1, %5, implicit-def $vcc, implicit $vcc, implicit $exec +... + +# CHECK-LABEL: name: dpp_reg_sequence_exec_changed +# CHECK: %0:vreg_64 = COPY $vgpr0_vgpr1 +# CHECK: %1:vreg_64 = COPY $vgpr2_vgpr3 +# CHECK: %2:vgpr_32 = V_MOV_B32_e32 5, implicit $exec +# CHECK: %3:vgpr_32 = V_MOV_B32_dpp %0.sub0, %1.sub0, 1, 15, 15, 1, implicit $exec +# CHECK: %4:vgpr_32 = V_MOV_B32_dpp %0.sub1, %1.sub1, 1, 15, 15, 1, implicit $exec +# CHECK: %5:vreg_64 = REG_SEQUENCE %3, %subreg.sub0, %4, %subreg.sub1 +# CHECK: S_BRANCH %bb.1 +# CHECK: bb.1: +# CHECK: %6:vgpr_32 = V_ADD_I32_e32 %5.sub0, %2, implicit-def $vcc, implicit $exec +# CHECK: %7:vgpr_32 = V_ADDC_U32_e32 %5.sub1, %2, implicit-def $vcc, implicit $vcc, implicit $exec +name: dpp_reg_sequence_exec_changed +tracksRegLiveness: true +body: | + bb.0: + liveins: $vgpr0_vgpr1, $vgpr2_vgpr3 + + %0:vreg_64 = COPY $vgpr0_vgpr1 + %1:vreg_64 = COPY $vgpr2_vgpr3 + %5:vgpr_32 = V_MOV_B32_e32 5, implicit $exec + %2:vgpr_32 = V_MOV_B32_dpp %0.sub0, %1.sub0, 1, 15, 15, 1, implicit $exec + %3:vgpr_32 = V_MOV_B32_dpp %0.sub1, %1.sub1, 1, 15, 15, 1, implicit $exec + %4:vreg_64 = REG_SEQUENCE %2, %subreg.sub0, %3, %subreg.sub1 + S_BRANCH %bb.1 + + bb.1: + %6:vgpr_32 = V_ADD_I32_e32 %4.sub0, %5, implicit-def $vcc, implicit $exec + %7:vgpr_32 = V_ADDC_U32_e32 %4.sub1, %5, implicit-def $vcc, implicit $vcc, implicit $exec +...