Index: llvm/lib/CodeGen/MachineScheduler.cpp =================================================================== --- llvm/lib/CodeGen/MachineScheduler.cpp +++ llvm/lib/CodeGen/MachineScheduler.cpp @@ -1556,6 +1556,10 @@ : TII(TII) {} void apply(ScheduleDAGInstrs *DAGInstrs) override; + +private: + bool applyImpl(ScheduleDAGMI *DAG, SUnit *ASU, SmallVectorImpl &APreds, + bool Preds); }; } // anonymous @@ -1573,43 +1577,52 @@ void MacroFusion::apply(ScheduleDAGInstrs *DAGInstrs) { ScheduleDAGMI *DAG = static_cast(DAGInstrs); - // For now, assume targets can only fuse with the branch. - SUnit &ExitSU = DAG->ExitSU; - MachineInstr *Branch = ExitSU.getInstr(); - if (!Branch) - return; + // For each of the block SUnits, iterate over its successors. + for (SUnit &ASU : DAG->SUnits) + applyImpl(DAG, &ASU, ASU.Succs, false); + + // Iterate over the predecessors of ExitSU. + applyImpl(DAG, &DAG->ExitSU, DAG->ExitSU.Preds, true); +} - for (SDep &PredDep : ExitSU.Preds) { - if (PredDep.isWeak()) +bool MacroFusion::applyImpl(ScheduleDAGMI *DAG, SUnit *ASU, + SmallVectorImpl &APreds, bool Preds) { + const MachineInstr *AMI = ASU->getInstr(); + if (!AMI || AMI->isPseudo() || AMI->isTransient()) + return false; + + for (SDep &BDep : APreds) { + if (BDep.isWeak()) continue; - SUnit &SU = *PredDep.getSUnit(); - MachineInstr &Pred = *SU.getInstr(); - if (!TII.shouldScheduleAdjacent(Pred, *Branch)) + + SUnit *BSU = BDep.getSUnit(); + const MachineInstr *BMI = BSU->getInstr(); + if (!BMI || BMI->isPseudo() || BMI->isTransient() || + !TII.shouldScheduleAdjacent(*AMI, *BMI)) continue; - // Create a single weak edge from SU to ExitSU. The only effect is to cause - // bottom-up scheduling to heavily prioritize the clustered SU. There is no - // need to copy predecessor edges from ExitSU to SU, since top-down - // scheduling cannot prioritize ExitSU anyway. To defer top-down scheduling - // of SU, we could create an artificial edge from the deepest root, but it - // hasn't been needed yet. - bool Success = DAG->addEdge(&ExitSU, SDep(&SU, SDep::Cluster)); - (void)Success; - assert(Success && "No DAG nodes should be reachable from ExitSU"); - - // Adjust latency of data deps between the nodes. - for (SDep &PredDep : ExitSU.Preds) { - if (PredDep.getSUnit() == &SU) - PredDep.setLatency(0); - } - for (SDep &SuccDep : SU.Succs) { - if (SuccDep.getSUnit() == &ExitSU) - SuccDep.setLatency(0); - } + // Create a single weak edge between the adjacent instrs. The only + // effect is to cause bottom-up scheduling to heavily prioritize the + // clustered instrs. + DAG->addEdge(BSU, SDep(ASU, SDep::Cluster)); - DEBUG(dbgs() << "Macro Fuse SU(" << SU.NodeNum << ")\n"); - break; + // Adjust the latency between the 1st instr and its predecessors/successors. + for (SDep &Dep : APreds) + if (Dep.getSUnit() == BSU) + Dep.setLatency(0); + + // Adjust the latency between the 2nd instr and its successors/predecessors. + SmallVectorImpl &BSuccs = Preds ? BSU->Succs : BSU->Preds; + for (SDep &Dep : BSuccs) + if (Dep.getSUnit() == ASU) + Dep.setLatency(0); + + DEBUG(dbgs() << "Macro fuse "; ASU->print(dbgs(), DAG); dbgs() << " - "; + BSU->print(dbgs(), DAG); dbgs() << '\n'); + return true; } + + return false; } //===----------------------------------------------------------------------===//