diff --git a/llvm/lib/Target/AMDGPU/GCNDPPCombine.cpp b/llvm/lib/Target/AMDGPU/GCNDPPCombine.cpp --- a/llvm/lib/Target/AMDGPU/GCNDPPCombine.cpp +++ b/llvm/lib/Target/AMDGPU/GCNDPPCombine.cpp @@ -59,16 +59,22 @@ #define DEBUG_TYPE "gcn-dpp-combine" +static const unsigned DefaultMaxDPPRegUses = 16; + +static cl::opt<unsigned> MaxDPPRegUses( + "dpp-combine-max-reg-uses", cl::init(DefaultMaxDPPRegUses), cl::Hidden, + cl::desc("Maximum number of uses for a DPP register to combine")); + STATISTIC(NumDPPMovsCombined, "Number of DPP moves combined."); namespace { +using RegSubRegPair = TargetInstrInfo::RegSubRegPair; + class GCNDPPCombine : public MachineFunctionPass { MachineRegisterInfo *MRI; const SIInstrInfo *TII; - using RegSubRegPair = TargetInstrInfo::RegSubRegPair; - MachineOperand *getOldOpndValue(MachineOperand &OldOpnd) const; MachineInstr *createDPPInst(MachineInstr &OrigMI, @@ -352,6 +358,70 @@ return (Imm->getImm() & Mask) == Value; } +static bool addUse(MachineOperand *Use, + SmallVectorImpl<MachineOperand *> &Uses) { + if (Uses.size() == MaxDPPRegUses) { + LLVM_DEBUG(dbgs() << "  " << *Use->getParent() << " failed: exceeded the " "maximum number of uses " "for a DPP register\n"); + return false; + } + + // Assuming per instruction uses come together, + // see defusechain_iterator::operator++ + if (!Uses.empty() && Uses.back()->getParent() == Use->getParent()) { + LLVM_DEBUG( + dbgs() + << "  " << *Use->getParent() + << " failed: DPP register is used more than once per instruction\n"); + return false; + } + + Uses.push_back(Use); + return true; +} + +// Store uses of RSR, checking that RSR.SubReg is used as independent lane +// (i.e not as part of the whole reg or other subreg) +static bool collectIndepSubRegUse(MachineRegisterInfo &MRI, RegSubRegPair RSR, + SmallVectorImpl<MachineOperand *> &Uses) { + const auto &TRI = + *static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo()); + LaneBitmask Mask = TRI.getSubRegIndexLaneMask(RSR.SubReg); + for (auto &Op : MRI.use_nodbg_operands(RSR.Reg)) { + if (Op.getSubReg() == 
RSR.SubReg) { + if (!addUse(&Op, Uses)) + return false; + } else if (!Op.getSubReg() || + (Mask & TRI.getSubRegIndexLaneMask(Op.getSubReg())).any()) + return false; + } + return true; +} + +static bool collectRegUse(MachineRegisterInfo &MRI, Register R, + SmallVectorImpl<MachineOperand *> &Uses) { + for (auto &Use : MRI.use_nodbg_operands(R)) { + auto &MI = *Use.getParent(); + switch (MI.getOpcode()) { + case AMDGPU::REG_SEQUENCE: { + unsigned OpNo = MI.getOperandNo(&Use); + assert(OpNo > 0 && "Use cannot be a destination register"); + auto RSR = RegSubRegPair(MI.getOperand(0).getReg(), + MI.getOperand(OpNo + 1).getImm()); + if (!collectIndepSubRegUse(MRI, RSR, Uses)) + return false; + break; + } + // TODO: add INSERT_SUBREG? + default: + if (!addUse(&Use, Uses)) + return false; + } + } + return true; +} + bool GCNDPPCombine::combineDPPMov(MachineInstr &MovMI) const { assert(MovMI.getOpcode() == AMDGPU::V_MOV_B32_dpp); LLVM_DEBUG(dbgs() << "\nDPP combine: " << MovMI); @@ -363,11 +433,6 @@ LLVM_DEBUG(dbgs() << " failed: dpp move writes physreg\n"); return false; } - if (execMayBeModifiedBeforeAnyUse(*MRI, DPPMovReg, MovMI)) { - LLVM_DEBUG(dbgs() << " failed: EXEC mask should remain the same" " for all uses\n"); - return false; - } auto *RowMaskOpnd = TII->getNamedOperand(MovMI, AMDGPU::OpName::row_mask); assert(RowMaskOpnd && RowMaskOpnd->isImm()); @@ -432,8 +497,17 @@ dbgs() << *OldOpndValue; dbgs() << ", bound_ctrl=" << CombBCZ << '\n'); + SmallVector<MachineOperand *, 4> Uses; + if (!collectRegUse(*MRI, DPPMovReg, Uses)) + return false; + + if (execMayBeModifiedBeforeAnyUse(*MRI, MovMI, Uses)) { + LLVM_DEBUG( + dbgs() << " failed: EXEC mask should remain the same for all uses\n"); + return false; + } + SmallVector<MachineInstr*, 4> OrigMIs, DPPMIs; - DenseMap<MachineInstr*, SmallVector<unsigned, 4>> RegSeqWithOpNos; auto CombOldVGPR = getRegSubRegPair(*OldOpnd); // try to reuse previous old reg if its undefined (IMPLICIT_DEF) if (CombBCZ && OldOpndValue) { // CombOldVGPR should be undef @@ -444,50 +518,14 @@ DPPMIs.push_back(UndefInst.getInstr()); } - 
OrigMIs.push_back(&MovMI); bool Rollback = true; - SmallVector<MachineOperand*, 4> Uses; - - for (auto &Use : MRI->use_nodbg_operands(DPPMovReg)) { - Uses.push_back(&Use); - } - while (!Uses.empty()) { MachineOperand *Use = Uses.pop_back_val(); Rollback = true; auto &OrigMI = *Use->getParent(); LLVM_DEBUG(dbgs() << "  try: " << OrigMI); - auto OrigOp = OrigMI.getOpcode(); - if (OrigOp == AMDGPU::REG_SEQUENCE) { - Register FwdReg = OrigMI.getOperand(0).getReg(); - unsigned FwdSubReg = 0; - - if (execMayBeModifiedBeforeAnyUse(*MRI, FwdReg, OrigMI)) { - LLVM_DEBUG(dbgs() << " failed: EXEC mask should remain the same" " for all uses\n"); - break; - } - - unsigned OpNo, E = OrigMI.getNumOperands(); - for (OpNo = 1; OpNo < E; OpNo += 2) { - if (OrigMI.getOperand(OpNo).getReg() == DPPMovReg) { - FwdSubReg = OrigMI.getOperand(OpNo + 1).getImm(); - break; - } - } - - if (!FwdSubReg) - break; - - for (auto &Op : MRI->use_nodbg_operands(FwdReg)) { - if (Op.getSubReg() == FwdSubReg) - Uses.push_back(&Op); - } - RegSeqWithOpNos[&OrigMI].push_back(OpNo); - continue; - } if (TII->isVOP3(OrigOp)) { if (!TII->hasVALU32BitEncoding(OrigOp)) { @@ -539,20 +577,17 @@ Rollback |= !Uses.empty(); - for (auto *MI : *(Rollback? &DPPMIs : &OrigMIs)) - MI->eraseFromParent(); - + // Don't delete the original dpp mov instruction because it may still be + // used by a REG_SEQUENCE(potentialy orphaned) producing invalid MIR. It will + // soon be deleted by the DCE. if (!Rollback) { - for (auto &S : RegSeqWithOpNos) { - if (MRI->use_nodbg_empty(S.first->getOperand(0).getReg())) { - S.first->eraseFromParent(); - continue; - } - while (!S.second.empty()) - S.first->getOperand(S.second.pop_back_val()).setIsUndef(true); - } + auto *Src0 = TII->getNamedOperand(MovMI, AMDGPU::OpName::src0); + Src0->setIsKill(false); } + for (auto *MI : *(Rollback ? 
&DPPMIs : &OrigMIs)) + MI->eraseFromParent(); + return !Rollback; } diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.h b/llvm/lib/Target/AMDGPU/SIInstrInfo.h --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.h +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.h @@ -1091,12 +1091,12 @@ const MachineInstr &DefMI, const MachineInstr &UseMI); -/// \brief Return false if EXEC is not changed between the def of \p VReg at \p -/// DefMI and all its uses. Should be run on SSA. Currently does not attempt to -/// track between blocks. -bool execMayBeModifiedBeforeAnyUse(const MachineRegisterInfo &MRI, - Register VReg, - const MachineInstr &DefMI); +/// \brief Return false if EXEC is not changed between the \p DefMI and all its +/// \p Uses. Should be run on SSA. Currently does not attempt to track between +/// blocks. +bool execMayBeModifiedBeforeAnyUse( + const MachineRegisterInfo &MRI, MachineInstr &DefMI, + const SmallVectorImpl<MachineOperand *> &Uses); namespace AMDGPU { diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp @@ -7015,45 +7015,40 @@ return false; } -bool llvm::execMayBeModifiedBeforeAnyUse(const MachineRegisterInfo &MRI, - Register VReg, - const MachineInstr &DefMI) { +bool llvm::execMayBeModifiedBeforeAnyUse( + const MachineRegisterInfo &MRI, MachineInstr &DefMI, + const SmallVectorImpl<MachineOperand *> &Uses) { assert(MRI.isSSA() && "Must be run on SSA"); + const auto &TRI = *MRI.getTargetRegisterInfo(); - auto *TRI = MRI.getTargetRegisterInfo(); - auto *DefBB = DefMI.getParent(); - - const int MaxUseInstScan = 10; - int NumUseInst = 0; - - for (auto &UseInst : MRI.use_nodbg_instructions(VReg)) { - // Don't bother searching between blocks, although it is possible this block - // doesn't modify exec. 
- if (UseInst.getParent() != DefBB) + for (const auto &Use : Uses) + if (Use->getParent()->getParent() != DefMI.getParent()) return true; - if (++NumUseInst > MaxUseInstScan) - return true; - } + DenseSet<MachineInstr *> Users; + for (const auto &Use : Uses) + Users.insert(Use->getParent()); const int MaxInstScan = 20; int NumInst = 0; // Stop scan when we have seen all the uses. for (auto I = std::next(DefMI.getIterator()); ; ++I) { + assert(I != DefMI.getParent()->end()); if (I->isDebugInstr()) continue; + Users.erase(&*I); + if (Users.empty()) + break; + if (++NumInst > MaxInstScan) return true; - if (I->readsRegister(VReg)) - if (--NumUseInst == 0) - return false; - - if (I->modifiesRegister(AMDGPU::EXEC, TRI)) + if (I->modifiesRegister(AMDGPU::EXEC, &TRI)) return true; } + return false; } MachineInstr *SIInstrInfo::createPHIDestinationCopy( diff --git a/llvm/test/CodeGen/AMDGPU/dpp_combine.mir b/llvm/test/CodeGen/AMDGPU/dpp_combine.mir --- a/llvm/test/CodeGen/AMDGPU/dpp_combine.mir +++ b/llvm/test/CodeGen/AMDGPU/dpp_combine.mir @@ -630,8 +630,9 @@ # GCN: %1:vreg_64 = COPY $vgpr2_vgpr3 # GCN: %2:vgpr_32 = V_MOV_B32_e32 5, implicit $exec # GCN: %8:vgpr_32 = IMPLICIT_DEF +# GCN: %3:vgpr_32 = V_MOV_B32_dpp %0.sub0, %1.sub0, 1, 15, 15, 1, implicit $exec # GCN: %4:vgpr_32 = V_MOV_B32_dpp %0.sub1, %1.sub1, 1, 1, 1, 1, implicit $exec -# GCN: %5:vreg_64 = REG_SEQUENCE undef %3:vgpr_32, %subreg.sub0, %4, %subreg.sub1 +# GCN: %5:vreg_64 = REG_SEQUENCE %3, %subreg.sub0, %4, %subreg.sub1 # GCN: %6:vgpr_32 = V_ADD_I32_dpp %8, %1.sub0, %2, 1, 15, 15, 1, implicit-def $vcc, implicit $exec # GCN: %7:vgpr_32 = V_ADDC_U32_e32 %5.sub1, %2, implicit-def $vcc, implicit $vcc, implicit $exec name: dpp_reg_sequence_first_combined @@ -656,7 +657,8 @@ # GCN: %2:vgpr_32 = V_MOV_B32_e32 5, implicit $exec # GCN: %3:vgpr_32 = V_MOV_B32_dpp %0.sub0, %1.sub0, 1, 1, 1, 1, implicit $exec # GCN: %8:vgpr_32 = IMPLICIT_DEF -# GCN: %5:vreg_64 = REG_SEQUENCE %3, %subreg.sub0, undef %4:vgpr_32, %subreg.sub1 +# GCN: 
%4:vgpr_32 = V_MOV_B32_dpp %0.sub1, %1.sub1, 1, 15, 15, 1, implicit $exec +# GCN: %5:vreg_64 = REG_SEQUENCE %3, %subreg.sub0, %4, %subreg.sub1 # GCN: %6:vgpr_32 = V_ADD_I32_e32 %5.sub0, %2, implicit-def $vcc, implicit $exec # GCN: %7:vgpr_32 = V_ADDC_U32_dpp %8, %1.sub1, %2, 1, 15, 15, 1, implicit-def $vcc, implicit $vcc, implicit $exec name: dpp_reg_sequence_second_combined @@ -784,8 +786,9 @@ ... # GCN-LABEL: name: dpp64_add64_first_combined +# GCN: %7:vgpr_32 = V_MOV_B32_dpp undef %1.sub0:vreg_64, undef %2.sub0:vreg_64, 1, 15, 15, 1, implicit $exec # GCN: %8:vgpr_32 = V_MOV_B32_dpp undef %1.sub1:vreg_64, undef %2.sub1:vreg_64, 1, 15, 15, 1, implicit $exec -# GCN: %0:vreg_64 = REG_SEQUENCE undef %7:vgpr_32, %subreg.sub0, %8, %subreg.sub1 +# GCN: %0:vreg_64 = REG_SEQUENCE %7, %subreg.sub0, %8, %subreg.sub1 # GCN: %3:vgpr_32 = V_ADD_I32_dpp undef %1.sub0:vreg_64, undef %2.sub0:vreg_64, undef %4:vgpr_32, 1, 15, 15, 1, implicit-def $vcc, implicit $exec # GCN: %5:vgpr_32, dead %6:sreg_64_xexec = V_ADDC_U32_e64 1, %0.sub1, undef $vcc, 0, implicit $exec name: dpp64_add64_first_combined @@ -833,3 +836,72 @@ S_ENDPGM 0, implicit %4 ... + +# GCN-LABEL: name: dont_combine_more_than_one_operand +# GCN: %3:vgpr_32 = V_MAX_F32_e64 0, %2, 0, %2, 0, 0, implicit $mode, implicit $exec +name: dont_combine_more_than_one_operand +tracksRegLiveness: true +body: | + bb.0: + liveins: $vgpr0, $vgpr1 + %0:vgpr_32 = COPY $vgpr0 + %1:vgpr_32 = COPY $vgpr1 + %2:vgpr_32 = V_MOV_B32_dpp %0, %1, 1, 15, 15, 1, implicit $exec + %3:vgpr_32 = V_MAX_F32_e64 0, %2, 0, %2, 0, 0, implicit $mode, implicit $exec +... 
+ +# GCN-LABEL: name: dont_combine_more_than_one_operand_dpp_reg_sequence +# GCN: %5:vgpr_32 = V_ADD_I32_e32 %4.sub0, %4.sub0, implicit-def $vcc, implicit $exec +# GCN: %6:vgpr_32 = V_ADDC_U32_e32 %4.sub1, %4.sub1, implicit-def $vcc, implicit $vcc, implicit $exec +name: dont_combine_more_than_one_operand_dpp_reg_sequence +tracksRegLiveness: true +body: | + bb.0: + liveins: $vgpr0_vgpr1, $vgpr2_vgpr3 + %0:vreg_64 = COPY $vgpr0_vgpr1 + %1:vreg_64 = COPY $vgpr2_vgpr3 + %2:vgpr_32 = V_MOV_B32_dpp %0.sub0, %1.sub0, 1, 15, 15, 1, implicit $exec + %3:vgpr_32 = V_MOV_B32_dpp %0.sub1, %1.sub1, 1, 15, 15, 1, implicit $exec + %4:vreg_64 = REG_SEQUENCE %2, %subreg.sub0, %3, %subreg.sub1 + %5:vgpr_32 = V_ADD_I32_e32 %4.sub0, %4.sub0, implicit-def $vcc, implicit $exec + %6:vgpr_32 = V_ADDC_U32_e32 %4.sub1, %4.sub1, implicit-def $vcc, implicit $vcc, implicit $exec +... + +# GCN-LABEL: name: dont_combine_dpp_reg_sequence_wholereg_used +# GCN: %6:vgpr_32 = V_ADD_I32_e32 %5.sub0, %2, implicit-def $vcc, implicit $exec +# GCN: %7:vgpr_32 = V_ADDC_U32_e32 %5.sub1, %2, implicit-def $vcc, implicit $vcc, implicit $exec +name: dont_combine_dpp_reg_sequence_wholereg_used +tracksRegLiveness: true +body: | + bb.0: + liveins: $vgpr0_vgpr1, $vgpr2_vgpr3 + %0:vreg_64 = COPY $vgpr0_vgpr1 + %1:vreg_64 = COPY $vgpr2_vgpr3 + %2:vgpr_32 = V_MOV_B32_e32 5, implicit $exec + %3:vgpr_32 = V_MOV_B32_dpp %0.sub0, %1.sub0, 1, 15, 15, 1, implicit $exec + %4:vgpr_32 = V_MOV_B32_dpp %0.sub1, %1.sub1, 1, 15, 15, 1, implicit $exec + %5:vreg_64 = REG_SEQUENCE %3, %subreg.sub0, %4, %subreg.sub1 + %6:vgpr_32 = V_ADD_I32_e32 %5.sub0, %2, implicit-def $vcc, implicit $exec + %7:vgpr_32 = V_ADDC_U32_e32 %5.sub1, %2, implicit-def $vcc, implicit $vcc, implicit $exec + S_NOP 1, implicit %5 +... 
+ +# GCN-LABEL: name: dpp_reg_sequence_use_supersubreg_for_second +# GCN: %8:vgpr_32 = IMPLICIT_DEF +# GCN: %6:vgpr_32 = V_ADD_I32_dpp %8, %1.sub0, %2, 1, 15, 15, 1, implicit-def $vcc, implicit $exec +# GCN: %7:vgpr_32 = V_ADDC_U32_e32 %5.sub2, %2, implicit-def $vcc, implicit $vcc, implicit $exec +name: dpp_reg_sequence_use_supersubreg_for_second +tracksRegLiveness: true +body: | + bb.0: + liveins: $vgpr0_vgpr1, $vgpr2_vgpr3 + %0:vreg_64 = COPY $vgpr0_vgpr1 + %1:vreg_64 = COPY $vgpr2_vgpr3 + %2:vgpr_32 = V_MOV_B32_e32 5, implicit $exec + %3:vgpr_32 = V_MOV_B32_dpp %0.sub0, %1.sub0, 1, 15, 15, 1, implicit $exec + %4:vgpr_32 = V_MOV_B32_dpp %0.sub1, %1.sub1, 1, 15, 15, 1, implicit $exec + %5:vreg_128 = REG_SEQUENCE %3, %subreg.sub0, %2, %subreg.sub1, %4, %subreg.sub2, %2, %subreg.sub3 + %6:vgpr_32 = V_ADD_I32_e32 %5.sub0, %2, implicit-def $vcc, implicit $exec + %7:vgpr_32 = V_ADDC_U32_e32 %5.sub2, %2, implicit-def $vcc, implicit $vcc, implicit $exec + S_NOP 1, implicit %5.sub2_sub3 +...