Index: llvm/include/llvm/CodeGen/MacroFusion.h =================================================================== --- llvm/include/llvm/CodeGen/MacroFusion.h +++ llvm/include/llvm/CodeGen/MacroFusion.h @@ -30,7 +30,8 @@ using ShouldSchedulePredTy = std::function; + const MachineInstr &SecondMI, + unsigned NumFused)>; /// Create a DAG scheduling mutation to pair instructions back to back /// for instructions that benefit according to the target-specific Index: llvm/lib/CodeGen/MacroFusion.cpp =================================================================== --- llvm/lib/CodeGen/MacroFusion.cpp +++ llvm/lib/CodeGen/MacroFusion.cpp @@ -36,6 +36,31 @@ return Dep.getKind() == SDep::Anti || Dep.getKind() == SDep::Output; } +namespace { + +static SUnit *getPredClusterSU(const SUnit &SU) { + for (const SDep &SI : SU.Preds) + if (SI.isCluster()) + return SI.getSUnit(); + + return nullptr; +} + +static SUnit *getSuccClusterSU(const SUnit &SU) { + for (const SDep &SI : SU.Succs) + if (SI.isCluster()) + return SI.getSUnit(); + + return nullptr; +} + +static unsigned getNumOfClusterSU(const SUnit &SU) { + unsigned Num = 0; + const SUnit *CurrentSU = &SU; + while ((CurrentSU = getPredClusterSU(*CurrentSU))) Num ++; + return Num; +} + static bool fuseInstructionPair(ScheduleDAGInstrs &DAG, SUnit &FirstSU, SUnit &SecondSU) { // Check that neither instr is already paired with another along the edge @@ -73,27 +98,34 @@ // Make data dependencies from the FirstSU also dependent on the SecondSU to // prevent them from being scheduled between the FirstSU and the SecondSU. 
- if (&SecondSU != &DAG.ExitSU) + SUnit *CurrentSU = &SecondSU; + while (CurrentSU && CurrentSU != &DAG.ExitSU) { for (const SDep &SI : FirstSU.Succs) { SUnit *SU = SI.getSUnit(); if (SI.isWeak() || isHazard(SI) || - SU == &DAG.ExitSU || SU == &SecondSU || SU->isPred(&SecondSU)) + SU == &DAG.ExitSU || SU == CurrentSU || + SU->isPred(CurrentSU)) continue; - LLVM_DEBUG(dbgs() << " Bind "; DAG.dumpNodeName(SecondSU); + LLVM_DEBUG(dbgs() << " Bind "; DAG.dumpNodeName(*CurrentSU); dbgs() << " - "; DAG.dumpNodeName(*SU); dbgs() << '\n';); - DAG.addEdge(SU, SDep(&SecondSU, SDep::Artificial)); + DAG.addEdge(SU, SDep(CurrentSU, SDep::Artificial)); } + CurrentSU = getSuccClusterSU(*CurrentSU); + } + // Make the FirstSU also dependent on the dependencies of the SecondSU to // prevent them from being scheduled between the FirstSU and the SecondSU. - if (&FirstSU != &DAG.EntrySU) { + CurrentSU = &FirstSU; + while (CurrentSU && CurrentSU != &DAG.EntrySU) { for (const SDep &SI : SecondSU.Preds) { SUnit *SU = SI.getSUnit(); - if (SI.isWeak() || isHazard(SI) || &FirstSU == SU || FirstSU.isSucc(SU)) + if (SI.isWeak() || isHazard(SI) || CurrentSU == SU || + CurrentSU->isSucc(SU)) continue; LLVM_DEBUG(dbgs() << " Bind "; DAG.dumpNodeName(*SU); dbgs() << " - "; - DAG.dumpNodeName(FirstSU); dbgs() << '\n';); - DAG.addEdge(&FirstSU, SDep(SU, SDep::Artificial)); + DAG.dumpNodeName(*CurrentSU); dbgs() << '\n';); + DAG.addEdge(CurrentSU, SDep(SU, SDep::Artificial)); } // ExitSU comes last by design, which acts like an implicit dependency // between ExitSU and any bottom root in the graph. 
We should transfer @@ -101,17 +133,17 @@ if (&SecondSU == &DAG.ExitSU) { for (SUnit &SU : DAG.SUnits) { if (SU.Succs.empty()) - DAG.addEdge(&FirstSU, SDep(&SU, SDep::Artificial)); + DAG.addEdge(CurrentSU, SDep(&SU, SDep::Artificial)); } } + + CurrentSU = getPredClusterSU(*CurrentSU); } ++NumFused; return true; } -namespace { - /// Post-process the DAG to create cluster edges between instrs that may /// be fused by the processor into a single operation. class MacroFusion : public ScheduleDAGMutation { @@ -148,7 +180,7 @@ const TargetSubtargetInfo &ST = DAG.MF.getSubtarget(); // Check if the anchor instr may be fused. - if (!shouldScheduleAdjacent(TII, ST, nullptr, AnchorMI)) + if (!shouldScheduleAdjacent(TII, ST, nullptr, AnchorMI, 0)) return false; // Explorer for fusion candidates among the dependencies of the anchor instr. @@ -162,7 +194,8 @@ continue; const MachineInstr *DepMI = DepSU.getInstr(); - if (!shouldScheduleAdjacent(TII, ST, DepMI, AnchorMI)) + if (!shouldScheduleAdjacent(TII, ST, DepMI, AnchorMI, + getNumOfClusterSU(DepSU))) continue; if (fuseInstructionPair(DAG, DepSU, AnchorSU)) Index: llvm/lib/Target/AArch64/AArch64MacroFusion.cpp =================================================================== --- llvm/lib/Target/AArch64/AArch64MacroFusion.cpp +++ llvm/lib/Target/AArch64/AArch64MacroFusion.cpp @@ -375,7 +375,12 @@ static bool shouldScheduleAdjacent(const TargetInstrInfo &TII, const TargetSubtargetInfo &TSI, const MachineInstr *FirstMI, - const MachineInstr &SecondMI) { + const MachineInstr &SecondMI, + unsigned NumFused) { + // Only back-to-back fusion is supported. 
+ if (NumFused > 0) + return false; + const AArch64Subtarget &ST = static_cast(TSI); // All checking functions assume that the 1st instr is a wildcard if it is Index: llvm/lib/Target/AMDGPU/AMDGPUMacroFusion.cpp =================================================================== --- llvm/lib/Target/AMDGPU/AMDGPUMacroFusion.cpp +++ llvm/lib/Target/AMDGPU/AMDGPUMacroFusion.cpp @@ -28,7 +28,12 @@ static bool shouldScheduleAdjacent(const TargetInstrInfo &TII_, const TargetSubtargetInfo &TSI, const MachineInstr *FirstMI, - const MachineInstr &SecondMI) { + const MachineInstr &SecondMI, + unsigned NumFused) { + // Only back-to-back fusion is supported. + if (NumFused > 0) + return false; + const SIInstrInfo &TII = static_cast(TII_); switch (SecondMI.getOpcode()) { Index: llvm/lib/Target/ARM/ARMMacroFusion.cpp =================================================================== --- llvm/lib/Target/ARM/ARMMacroFusion.cpp +++ llvm/lib/Target/ARM/ARMMacroFusion.cpp @@ -51,7 +51,12 @@ static bool shouldScheduleAdjacent(const TargetInstrInfo &TII, const TargetSubtargetInfo &TSI, const MachineInstr *FirstMI, - const MachineInstr &SecondMI) { + const MachineInstr &SecondMI, + unsigned NumFused) { + // Only back-to-back fusion is supported. + if (NumFused > 0) + return false; + const ARMSubtarget &ST = static_cast(TSI); if (ST.hasFuseAES() && isAESPair(FirstMI, SecondMI)) Index: llvm/lib/Target/X86/X86MacroFusion.cpp =================================================================== --- llvm/lib/Target/X86/X86MacroFusion.cpp +++ llvm/lib/Target/X86/X86MacroFusion.cpp @@ -180,7 +180,12 @@ static bool shouldScheduleAdjacent(const TargetInstrInfo &TII, const TargetSubtargetInfo &TSI, const MachineInstr *FirstMI, - const MachineInstr &SecondMI) { + const MachineInstr &SecondMI, + unsigned NumFused) { + // Only back-to-back fusion is supported. + if (NumFused > 0) + return false; + const X86Subtarget &ST = static_cast(TSI); // Check if this processor supports any kind of fusion. 
Index: llvm/test/CodeGen/AArch64/macro-fusion-verify.ll =================================================================== --- /dev/null +++ llvm/test/CodeGen/AArch64/macro-fusion-verify.ll @@ -0,0 +1,40 @@ +; REQUIRES: asserts +; RUN: llc < %s -mtriple=aarch64-linux-gnu -mattr=+fuse-arith-logic -verify-misched -debug-only=machine-scheduler 2>&1 > /dev/null | FileCheck %s + +; Verify that macro-fusion does not introduce extra dependencies. +define signext i32 @test(i32 signext %a, i32 signext %b, i32 signext %c, i32 signext %d) { +entry: +; CHECK: ********** MI Scheduling ********** +; CHECK-LABEL: %bb.0 entry +; CHECK: Macro fuse: SU([[SU4:[0-9]+]]) - SU([[SU5:[0-9]+]]) +; CHECK: SU([[SU0:[0-9]+]]): %{{[0-9]+}}:gpr32 = COPY $w3 +; CHECK: SU([[SU1:[0-9]+]]): %{{[0-9]+}}:gpr32 = COPY $w2 +; CHECK: SU([[SU2:[0-9]+]]): %{{[0-9]+}}:gpr32 = COPY $w1 +; CHECK: SU([[SU3:[0-9]+]]): %{{[0-9]+}}:gpr32 = COPY $w0 + +; Because SU(4) and SU(5) are clustered, SU(4) has the predecessor SU(1), +; which is a predecessor of SU(5), to make sure that SU(1) cannot +; be scheduled in between SU(4) and SU(5). +; CHECK: SU([[SU4:[0-9]+]]): %{{[0-9]+}}:gpr32 = nsw ADDWrr +; CHECK: Predecessors: +; CHECK-DAG: SU([[SU3]]): +; CHECK-DAG: SU([[SU2]]): +; CHECK-DAG: SU([[SU1]]): +; CHECK-NOT: SU([[SU0]]) +; CHECK: Successors: +; CHECK: SU([[SU5]]): Ord Latency=0 Cluster + +; SU(0) has nothing to do with SU(4) and SU(5). They shouldn't have +; any dependency. +; CHECK: SU([[SU5]]): %{{[0-9]+}}:gpr32 = nsw ADDWrr +; CHECK: Predecessors: +; CHECK-DAG: SU([[SU1]]) +; CHECK-DAG: SU([[SU4]]) +; CHECK-NOT: SU([[SU0]]) +; CHECK: Successors: + + %add = add nsw i32 %b, %a + %add1 = add nsw i32 %add, %c + %sub = sub nsw i32 %add1, %d + ret i32 %sub +}