Index: lib/CodeGen/MachineScheduler.cpp
===================================================================
--- lib/CodeGen/MachineScheduler.cpp
+++ lib/CodeGen/MachineScheduler.cpp
@@ -1533,6 +1533,46 @@
   return false;
 }
 
+/// Check the dependencies in \p DAG to determine whether \p Node0 can be
+/// scheduled immediately before \p Node1.
+static bool canScheduleAdjacent(ScheduleDAGInstrs &DAG, const SUnit &Node0,
+                                const SUnit &Node1) {
+  // This is only a barebones implementation right now, limited to
+  // Node1 == ExitSU. (This could be extended by employing
+  // ScheduleDAGMI.Topo.isReachable() queries on Node0 successors in the future)
+  assert(&Node1 == &DAG.ExitSU && "Only implemented for ExitSU node");
+  for (const SDep &Succ : Node0.Succs) {
+    if (Succ.getSUnit() != &Node1)
+      return false;
+  }
+  return true;
+}
+
+/// Add artificial edges to force adjacent scheduling of \p Node0 and \p Node1.
+static void addFusionEdges(ScheduleDAGMI &DAG, SUnit &Node0, SUnit &Node1) {
+  assert(&Node1 == &DAG.ExitSU &&
+         "addFusionEdges() only implemented for Node1 == ExitSU");
+  // This is simpler than the general case: We only need an artificial edge
+  // from nodes that have no successors other than ExitSU.
+  for (SUnit &SU : DAG.SUnits) {
+    if (&SU == &Node0)
+      continue;
+
+    bool NeedEdge = true;
+    for (const SDep &SuccDep : SU.Succs) {
+      if (SuccDep.isWeak())
+        continue;
+      const SUnit &Succ = *SuccDep.getSUnit();
+      if (&Succ != &DAG.ExitSU) {
+        NeedEdge = false;
+        break;
+      }
+    }
+    if (NeedEdge)
+      DAG.addEdge(&Node0, SDep(&SU, SDep::Artificial));
+  }
+}
+
 /// \brief Callback from DAG postProcessing to create cluster edges to encourage
 /// fused operations.
 void MacroFusion::apply(ScheduleDAGInstrs *DAGInstrs) {
@@ -1545,9 +1585,6 @@
     return;
 
   for (SUnit &SU : DAG->SUnits) {
-    // SUnits with successors can't be schedule in front of the ExitSU.
-    if (!SU.Succs.empty())
-      continue;
     // We only care if the node writes to a register that the branch reads.
     MachineInstr *Pred = SU.getInstr();
     if (!HasDataDep(TRI, *Branch, *Pred))
@@ -1556,17 +1593,12 @@
     if (!TII.shouldScheduleAdjacent(*Pred, *Branch))
      continue;
 
-    // Create a single weak edge from SU to ExitSU. The only effect is to cause
-    // bottom-up scheduling to heavily prioritize the clustered SU. There is no
-    // need to copy predecessor edges from ExitSU to SU, since top-down
-    // scheduling cannot prioritize ExitSU anyway. To defer top-down scheduling
-    // of SU, we could create an artificial edge from the deepest root, but it
-    // hasn't been needed yet.
-    bool Success = DAG->addEdge(&ExitSU, SDep(&SU, SDep::Cluster));
-    (void)Success;
-    assert(Success && "No DAG nodes should be reachable from ExitSU");
-
-    DEBUG(dbgs() << "Macro Fuse SU(" << SU.NodeNum << ")\n");
+    if (!canScheduleAdjacent(*DAG, SU, ExitSU))
+      continue;
+
+    DEBUG(dbgs() << "Macro Fuse SU(" << SU.NodeNum << ") and SU("
+                 << ExitSU.NodeNum << ")\n");
+    addFusionEdges(*DAG, SU, ExitSU);
     break;
   }
 }
@@ -3267,7 +3299,12 @@
 
 /// Create a generic scheduler with no vreg liveness or DAG mutation passes.
 static ScheduleDAGInstrs *createGenericSchedPostRA(MachineSchedContext *C) {
-  return new ScheduleDAGMI(C, make_unique<PostGenericScheduler>(C), /*IsPostRA=*/true);
+  ScheduleDAGMI *DAG =
+      new ScheduleDAGMI(C, make_unique<PostGenericScheduler>(C),
+                        /*IsPostRA=*/true);
+  if (EnableMacroFusion)
+    DAG->addMutation(createMacroFusionDAGMutation(DAG->TII, DAG->TRI));
+  return DAG;
 }
 
 //===----------------------------------------------------------------------===//
Index: test/CodeGen/AArch64/postmisched-fusion.mir
===================================================================
--- /dev/null
+++ test/CodeGen/AArch64/postmisched-fusion.mir
@@ -0,0 +1,23 @@
+# RUN: llc -o - %s -mtriple=aarch64-- -mcpu=cyclone -enable-post-misched -run-pass=postmisched | FileCheck %s
+# Test that the post machine scheduler respects macro-op fusion.
+--- |
+  define void @func0() { ret void }
+...
+---
+# CHECK-LABEL: name: func0
+# CHECK: %xzr = SUBSXri{{.*}}implicit-def %nzcv
+# CHECK-NEXT: Bcc {{.*}}implicit killed %nzcv
+name: func0
+body: |
+  bb.0:
+    successors: %bb.1, %bb.2
+    %x8 = IMPLICIT_DEF
+    %x9 = LDRXui %x8, 0 :: (load 8)
+    dead %xzr = SUBSXri %x8, 0, 0, implicit-def %nzcv
+    %x10 = ADDXri %x9, 13, 0
+    Bcc 1, %bb.1, implicit killed %nzcv
+    B %bb.2
+
+  bb.1:
+  bb.2:
+...
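
Note for context, not part of the diff: MacroFusion::apply() decides which pairs to
keep adjacent by querying the target hook TargetInstrInfo::shouldScheduleAdjacent().
The sketch below is only an illustration of the kind of opcode pairing such a hook
checks for the SUBSXri + Bcc sequence exercised by the MIR test above; the helper
name and the exact set of opcodes are assumptions, not the actual AArch64 override.

// Illustration only: a minimal, hypothetical pairing check in the spirit of
// the target's shouldScheduleAdjacent() hook. It accepts a flag-setting
// add/subtract followed by the conditional branch that consumes NZCV.
static bool isFusibleCmpBranchPair(const MachineInstr &FlagSetter,
                                   const MachineInstr &Branch) {
  if (Branch.getOpcode() != AArch64::Bcc)
    return false;
  switch (FlagSetter.getOpcode()) {
  case AArch64::SUBSWri:
  case AArch64::SUBSXri: // the pair covered by postmisched-fusion.mir
  case AArch64::ADDSWri:
  case AArch64::ADDSXri:
    return true;
  default:
    return false;
  }
}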