Index: include/llvm/CodeGen/ScheduleDAG.h =================================================================== --- include/llvm/CodeGen/ScheduleDAG.h +++ include/llvm/CodeGen/ScheduleDAG.h @@ -289,6 +289,7 @@ bool isCloned : 1; // True if this node has been cloned. bool isUnbuffered : 1; // Uses an unbuffered resource. bool hasReservedResource : 1; // Uses a reserved resource. + bool clusteredWithBottom : 1; // Node clustered with bottom boundary. Sched::Preference SchedulingPref; // Scheduling preference. private: @@ -315,9 +316,10 @@ isPending(false), isAvailable(false), isScheduled(false), isScheduleHigh(false), isScheduleLow(false), isCloned(false), isUnbuffered(false), hasReservedResource(false), - SchedulingPref(Sched::None), isDepthCurrent(false), - isHeightCurrent(false), Depth(0), Height(0), TopReadyCycle(0), - BotReadyCycle(0), CopyDstRC(nullptr), CopySrcRC(nullptr) {} + clusteredWithBottom(false), SchedulingPref(Sched::None), + isDepthCurrent(false), isHeightCurrent(false), Depth(0), Height(0), + TopReadyCycle(0), BotReadyCycle(0), CopyDstRC(nullptr), + CopySrcRC(nullptr) {} /// SUnit - Construct an SUnit for post-regalloc scheduling to represent /// a MachineInstr. @@ -331,9 +333,10 @@ isPending(false), isAvailable(false), isScheduled(false), isScheduleHigh(false), isScheduleLow(false), isCloned(false), isUnbuffered(false), hasReservedResource(false), - SchedulingPref(Sched::None), isDepthCurrent(false), - isHeightCurrent(false), Depth(0), Height(0), TopReadyCycle(0), - BotReadyCycle(0), CopyDstRC(nullptr), CopySrcRC(nullptr) {} + clusteredWithBottom(false), SchedulingPref(Sched::None), + isDepthCurrent(false), isHeightCurrent(false), Depth(0), Height(0), + TopReadyCycle(0), BotReadyCycle(0), CopyDstRC(nullptr), + CopySrcRC(nullptr) {} /// SUnit - Construct a placeholder SUnit. 
SUnit() @@ -346,9 +349,10 @@ isPending(false), isAvailable(false), isScheduled(false), isScheduleHigh(false), isScheduleLow(false), isCloned(false), isUnbuffered(false), hasReservedResource(false), - SchedulingPref(Sched::None), isDepthCurrent(false), - isHeightCurrent(false), Depth(0), Height(0), TopReadyCycle(0), - BotReadyCycle(0), CopyDstRC(nullptr), CopySrcRC(nullptr) {} + clusteredWithBottom(false), SchedulingPref(Sched::None), + isDepthCurrent(false), isHeightCurrent(false), Depth(0), Height(0), + TopReadyCycle(0), BotReadyCycle(0), CopyDstRC(nullptr), + CopySrcRC(nullptr) {} /// \brief Boundary nodes are placeholders for the boundary of the /// scheduling region. Index: lib/CodeGen/MachineScheduler.cpp =================================================================== --- lib/CodeGen/MachineScheduler.cpp +++ lib/CodeGen/MachineScheduler.cpp @@ -1536,15 +1536,15 @@ /// \brief Callback from DAG postProcessing to create cluster edges to encourage /// fused operations. void MacroFusion::apply(ScheduleDAGInstrs *DAGInstrs) { - ScheduleDAGMI *DAG = static_cast<ScheduleDAGMI*>(DAGInstrs); + ScheduleDAGMI &DAG = static_cast<ScheduleDAGMI&>(*DAGInstrs); // For now, assume targets can only fuse with the branch. - SUnit &ExitSU = DAG->ExitSU; + SUnit &ExitSU = DAG.ExitSU; MachineInstr *Branch = ExitSU.getInstr(); if (!Branch) return; - for (SUnit &SU : DAG->SUnits) { + for (SUnit &SU : DAG.SUnits) { // SUnits with successors can't be schedule in front of the ExitSU. if (!SU.Succs.empty()) continue; @@ -1557,15 +1557,19 @@ continue; // Create a single weak edge from SU to ExitSU. The only effect is to cause - // bottom-up scheduling to heavily prioritize the clustered SU. There is no - // need to copy predecessor edges from ExitSU to SU, since top-down - // scheduling cannot prioritize ExitSU anyway. To defer top-down scheduling - // of SU, we could create an artificial edge from the deepest root, but it - // hasn't been needed yet.
- bool Success = DAG->addEdge(&ExitSU, SDep(&SU, SDep::Cluster)); + // bottom-up scheduling to heavily prioritize the clustered SU. + bool Success = DAG.addEdge(&ExitSU, SDep(&SU, SDep::Cluster)); (void)Success; assert(Success && "No DAG nodes should be reachable from ExitSU"); + // Currently only works for clustering with the ExitSU. If this ever needs + // to be extended to arbitrary nodes then we probably need to copy the preds + // of the second to the first node and the succs of the second to the first + // as weak edges. + assert(ExitSU.isBoundaryNode() && "Only works for ExitSU for now."); + // This will defer scheduling of \p SU in top-down scheduling. + SU.clusteredWithBottom = true; + DEBUG(dbgs() << "Macro Fuse SU(" << SU.NodeNum << ")\n"); break; } @@ -2797,6 +2801,41 @@ << ":" << Cand.RPDelta.Excess.getUnitInc() << "\n"); } +static bool tryCluster(GenericSchedulerBase::SchedCandidate &Cand, + GenericSchedulerBase::SchedCandidate &TryCand, + const ScheduleDAGMI &DAG) { + // Keep clustered nodes together to encourage downstream peephole + // optimizations which may reduce resource requirements. + // + // This is a best effort to set things up for a post-RA pass. Optimizations + // like generating loads of multiple registers should ideally be done within + // the scheduler pass by combining the loads during DAG postprocessing. + const SUnit *CandNextClusterSU = + Cand.AtTop ? DAG.getNextClusterSucc() : DAG.getNextClusterPred(); + const SUnit *TryCandNextClusterSU = + TryCand.AtTop ? DAG.getNextClusterSucc() : DAG.getNextClusterPred(); + return tryGreater(TryCand.SU == TryCandNextClusterSU, + Cand.SU == CandNextClusterSU, TryCand, Cand, + GenericScheduler::Cluster); +} + +static bool tryWeak(GenericSchedulerBase::SchedCandidate &Cand, + GenericSchedulerBase::SchedCandidate &TryCand) { + if (Cand.AtTop != TryCand.AtTop) + return false; + + // Weak edges are for clustering and other constraints. 
+ if (tryLess(getWeakLeft(TryCand.SU, TryCand.AtTop), + getWeakLeft(Cand.SU, Cand.AtTop), + TryCand, Cand, GenericScheduler::Weak)) + return true; + + // delay top node when it should be clustered with the bottom boundary. + return TryCand.AtTop && + tryLess(TryCand.SU->clusteredWithBottom, Cand.SU->clusteredWithBottom, + TryCand, Cand, GenericScheduler::Weak); +} + /// Apply a set of heursitics to a new candidate. Heuristics are currently /// hierarchical. This may be more efficient than a graduated cost model because /// we don't need to evaluate all aspects of the model for each node in the @@ -2856,28 +2895,10 @@ return; } - // Keep clustered nodes together to encourage downstream peephole - // optimizations which may reduce resource requirements. - // - // This is a best effort to set things up for a post-RA pass. Optimizations - // like generating loads of multiple registers should ideally be done within - // the scheduler pass by combining the loads during DAG postprocessing. - const SUnit *CandNextClusterSU = - Cand.AtTop ? DAG->getNextClusterSucc() : DAG->getNextClusterPred(); - const SUnit *TryCandNextClusterSU = - TryCand.AtTop ? DAG->getNextClusterSucc() : DAG->getNextClusterPred(); - if (tryGreater(TryCand.SU == TryCandNextClusterSU, - Cand.SU == CandNextClusterSU, - TryCand, Cand, Cluster)) + if (tryCluster(Cand, TryCand, *DAG)) + return; + if (tryWeak(Cand, TryCand)) return; - - if (SameBoundary) { - // Weak edges are for clustering and other constraints. - if (tryLess(getWeakLeft(TryCand.SU, TryCand.AtTop), - getWeakLeft(Cand.SU, Cand.AtTop), - TryCand, Cand, Weak)) - return; - } // Avoid increasing the max pressure of the entire region. if (DAG->isTrackingPressure() && tryPressure(TryCand.RPDelta.CurrentMax, @@ -3188,6 +3209,11 @@ return; } + if (tryCluster(Cand, TryCand, *DAG)) + return; + if (tryWeak(Cand, TryCand)) + return; + // Prioritize instructions that read unbuffered resources by stall cycles. 
if (tryLess(Top.getLatencyStallCycles(TryCand.SU), Top.getLatencyStallCycles(Cand.SU), TryCand, Cand, Stall)) @@ -3267,7 +3293,12 @@ /// Create a generic scheduler with no vreg liveness or DAG mutation passes. static ScheduleDAGInstrs *createGenericSchedPostRA(MachineSchedContext *C) { - return new ScheduleDAGMI(C, make_unique<PostGenericScheduler>(C), /*IsPostRA=*/true); + ScheduleDAGMI *DAG = + new ScheduleDAGMI(C, make_unique<PostGenericScheduler>(C), + /*IsPostRA=*/true); + if (EnableMacroFusion) + DAG->addMutation(createMacroFusionDAGMutation(DAG->TII, DAG->TRI)); + return DAG; } //===----------------------------------------------------------------------===// Index: test/CodeGen/AArch64/postmisched-fusion.mir =================================================================== --- /dev/null +++ test/CodeGen/AArch64/postmisched-fusion.mir @@ -0,0 +1,23 @@ +# RUN: llc -o - %s -mtriple=aarch64-- -mcpu=cyclone -enable-post-misched -run-pass=postmisched | FileCheck %s +# Test that the post machine scheduler respects macro op fusion. +--- | + define void @func0() { ret void } +... +--- +# CHECK-LABEL: name: func0 +# CHECK: %xzr = SUBSXri{{.*}}implicit-def %nzcv
# CHECK-NEXT: Bcc {{.*}}implicit killed %nzcv +name: func0 +body: | + bb.0: + successors: %bb.1, %bb.2 + %x8 = IMPLICIT_DEF + %x9 = LDRXui %x8, 0 :: (load 8) + dead %xzr = SUBSXri %x8, 0, 0, implicit def %nzcv + %x10 = ADDXri %x9, 13, 0 + Bcc 1, %bb.1, implicit killed %nzcv + B %bb.2 + + bb.1: + bb.2: +...