Index: include/llvm/CodeGen/MacroFusion.h =================================================================== --- /dev/null +++ include/llvm/CodeGen/MacroFusion.h @@ -0,0 +1,36 @@ +//===- MacroFusion.h - Macro Fusion ------------------------===// +// +// The LLVM Compiler Infrastructure +// +// \fileThis file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file contains the definition of the DAG scheduling mutation to pair +// instructions back to back. +// +//===----------------------------------------------------------------------===// + +#include +#include "llvm/Target/TargetInstrInfo.h" +#include "llvm/CodeGen/MachineScheduler.h" + +//===----------------------------------------------------------------------===// +// MacroFusion - DAG post-processing to encourage fusion of macro ops. +//===----------------------------------------------------------------------===// + +namespace llvm { + +typedef std::function ShouldSchedulePredTy; + +/// \brief Create a DAG scheduling mutation to pair instructions back to back +/// for instructions that benefit according to the target-specific +/// shouldScheduleAdjacent predicate function. +std::unique_ptr + createMacroFusionDAGMutation(ShouldSchedulePredTy shouldScheduleAdjacent); + +} // llvm Index: lib/CodeGen/CMakeLists.txt =================================================================== --- lib/CodeGen/CMakeLists.txt +++ lib/CodeGen/CMakeLists.txt @@ -92,6 +92,7 @@ PatchableFunction.cpp MIRPrinter.cpp MIRPrintingPass.cpp + MacroFusion.cpp OptimizePHIs.cpp ParallelCG.cpp PeepholeOptimizer.cpp Index: lib/CodeGen/MacroFusion.cpp =================================================================== --- /dev/null +++ lib/CodeGen/MacroFusion.cpp @@ -0,0 +1,146 @@ +//===- MacroFusion.cpp - Macro Fusion ----------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// \file This file contains the implementation of the DAG scheduling mutation +// to pair instructions back to back. +// +//===----------------------------------------------------------------------===// + +#include "llvm/CodeGen/MacroFusion.h" +#include "llvm/ADT/Statistic.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Target/TargetInstrInfo.h" + +#define DEBUG_TYPE "misched" + +STATISTIC(NumFused, "Number of instr pairs fused"); + +using namespace llvm; + +static cl::opt EnableMacroFusion("misched-fusion", cl::Hidden, + cl::desc("Enable scheduling for macro fusion."), cl::init(true)); + +namespace { + +/// \brief Post-process the DAG to create cluster edges between instrs that may +/// be fused by the processor into a single operation. +class MacroFusion : public ScheduleDAGMutation { + ShouldSchedulePredTy shouldScheduleAdjacent; + bool scheduleAdjacentImpl(ScheduleDAGMI *DAG, SUnit &AnchorSU); +public: + MacroFusion(ShouldSchedulePredTy shouldScheduleAdjacent) : shouldScheduleAdjacent(shouldScheduleAdjacent) {} + + void apply(ScheduleDAGInstrs *DAGInstrs) override; +}; + +void MacroFusion::apply(ScheduleDAGInstrs *DAGInstrs) { + ScheduleDAGMI *DAG = static_cast(DAGInstrs); + + // For each of the SUnits in the scheduling block, try to fuse the instr in it + // with one in its successors. + for (SUnit &ISU : DAG->SUnits) + scheduleAdjacentImpl(DAG, ISU); + + // Try to fuse the instr in the ExitSU with one in its predecessors. + scheduleAdjacentImpl(DAG, DAG->ExitSU); +} + +/// \brief Implement the fusion of instr pairs in the scheduling DAG, +/// anchored at the instr in AnchorSU.. +bool MacroFusion::scheduleAdjacentImpl(ScheduleDAGMI *DAG, SUnit &AnchorSU) { + const MachineInstr *AnchorMI = AnchorSU.getInstr(); + if (!AnchorMI || AnchorMI->isPseudo() || AnchorMI->isTransient()) + return false; + + // If the anchor instr is the ExitSU, then consider its predecessors; + // otherwise, its successors. + bool Preds = (&AnchorSU == &DAG->ExitSU); + SmallVectorImpl &AnchorDeps = Preds ? AnchorSU.Preds : AnchorSU.Succs; + + const MachineInstr *FirstMI = Preds ? nullptr : AnchorMI; + const MachineInstr *SecondMI = Preds ? AnchorMI : nullptr; + + // Check if the anchor instr may be fused. + if (!shouldScheduleAdjacent(*DAG->TII, DAG->MF.getSubtarget(), + FirstMI, SecondMI)) + return false; + + // Explorer for fusion candidates among the dependencies of the anchor instr. + for (SDep &Dep : AnchorDeps) { + // Ignore dependencies that don't enforce ordering. + if (Dep.isWeak()) + continue; + + SUnit &DepSU = *Dep.getSUnit(); + // Ignore the ExitSU if the dependents are successors. + if (!Preds && &DepSU == &DAG->ExitSU) + continue; + + const MachineInstr *DepMI = DepSU.getInstr(); + if (!DepMI || DepMI->isPseudo() || DepMI->isTransient()) + continue; + + FirstMI = Preds ? DepMI : AnchorMI; + SecondMI = Preds ? AnchorMI : DepMI; + if (!shouldScheduleAdjacent(*DAG->TII, DAG->MF.getSubtarget(), + FirstMI, SecondMI)) + continue; + + // Create a single weak edge between the adjacent instrs. The only effect is + // to cause bottom-up scheduling to heavily prioritize the clustered instrs. + SUnit &FirstSU = Preds ? DepSU : AnchorSU; + SUnit &SecondSU = Preds ? AnchorSU : DepSU; + DAG->addEdge(&SecondSU, SDep(&FirstSU, SDep::Cluster)); + + // Adjust the latency between the anchor instr and its + // predecessors/successors. + for (SDep &IDep : AnchorDeps) + if (IDep.getSUnit() == &DepSU) + IDep.setLatency(0); + + // Adjust the latency between the dependent instr and its + // successors/predecessors. + for (SDep &IDep : Preds ? DepSU.Succs : DepSU.Preds) + if (IDep.getSUnit() == &AnchorSU) + IDep.setLatency(0); + + DEBUG(dbgs() << DAG->MF.getName() << "(): Macro fuse "; + FirstSU.print(dbgs(), DAG); dbgs() << " - "; + SecondSU.print(dbgs(), DAG); dbgs() << " / "; + dbgs() << DAG->TII->getName(FirstMI->getOpcode()) << " - " << + DAG->TII->getName(SecondMI->getOpcode()) << '\n'; ); + + if (&SecondSU != &DAG->ExitSU) + // Make instructions dependent on FirstSU also dependent on SecondSU to + // prevent them from being scheduled between FirstSU and and SecondSU. + for (const SDep &SI : FirstSU.Succs) { + if (SI.getSUnit() == &SecondSU) + continue; + DEBUG(dbgs() << " Copy Succ "; + SI.getSUnit()->print(dbgs(), DAG); dbgs() << '\n';); + DAG->addEdge(SI.getSUnit(), SDep(&SecondSU, SDep::Artificial)); + } + + ++NumFused; + return true; + } + + return false; +} + +} // end namespace + + +namespace llvm { + +std::unique_ptr createMacroFusionDAGMutation (ShouldSchedulePredTy shouldScheduleAdjacent) { + return EnableMacroFusion ? make_unique(shouldScheduleAdjacent) : nullptr; +} + +} // end namespace llvm Index: lib/Target/AArch64/AArch64MacroFusion.cpp =================================================================== --- lib/Target/AArch64/AArch64MacroFusion.cpp +++ lib/Target/AArch64/AArch64MacroFusion.cpp @@ -15,19 +15,12 @@ #include "AArch64MacroFusion.h" #include "AArch64Subtarget.h" #include "llvm/ADT/Statistic.h" -#include "llvm/Support/CommandLine.h" +#include "llvm/CodeGen/MacroFusion.h" #include "llvm/Target/TargetInstrInfo.h" -#define DEBUG_TYPE "misched" - -STATISTIC(NumFused, "Number of instr pairs fused"); - using namespace llvm; -static cl::opt EnableMacroFusion("aarch64-misched-fusion", cl::Hidden, - cl::desc("Enable scheduling for macro fusion."), cl::init(true)); - -namespace { +namespace llvm { /// \brief Verify that the instr pair, FirstMI and SecondMI, should be fused /// together. Given an anchor instr, when the other instr is unspecified, then @@ -167,119 +160,8 @@ return false; } -/// \brief Implement the fusion of instr pairs in the scheduling DAG, -/// anchored at the instr in AnchorSU.. -static bool scheduleAdjacentImpl(ScheduleDAGMI *DAG, SUnit &AnchorSU) { - const MachineInstr *AnchorMI = AnchorSU.getInstr(); - if (!AnchorMI || AnchorMI->isPseudo() || AnchorMI->isTransient()) - return false; - - // If the anchor instr is the ExitSU, then consider its predecessors; - // otherwise, its successors. - bool Preds = (&AnchorSU == &DAG->ExitSU); - SmallVectorImpl &AnchorDeps = Preds ? AnchorSU.Preds : AnchorSU.Succs; - - const MachineInstr *FirstMI = Preds ? nullptr : AnchorMI; - const MachineInstr *SecondMI = Preds ? AnchorMI : nullptr; - - // Check if the anchor instr may be fused. - if (!shouldScheduleAdjacent(*DAG->TII, DAG->MF.getSubtarget(), - FirstMI, SecondMI)) - return false; - - // Explorer for fusion candidates among the dependencies of the anchor instr. - for (SDep &Dep : AnchorDeps) { - // Ignore dependencies that don't enforce ordering. - if (Dep.isWeak()) - continue; - - SUnit &DepSU = *Dep.getSUnit(); - // Ignore the ExitSU if the dependents are successors. - if (!Preds && &DepSU == &DAG->ExitSU) - continue; - - const MachineInstr *DepMI = DepSU.getInstr(); - if (!DepMI || DepMI->isPseudo() || DepMI->isTransient()) - continue; - - FirstMI = Preds ? DepMI : AnchorMI; - SecondMI = Preds ? AnchorMI : DepMI; - if (!shouldScheduleAdjacent(*DAG->TII, DAG->MF.getSubtarget(), - FirstMI, SecondMI)) - continue; - - // Create a single weak edge between the adjacent instrs. The only effect is - // to cause bottom-up scheduling to heavily prioritize the clustered instrs. - SUnit &FirstSU = Preds ? DepSU : AnchorSU; - SUnit &SecondSU = Preds ? AnchorSU : DepSU; - DAG->addEdge(&SecondSU, SDep(&FirstSU, SDep::Cluster)); - - // Adjust the latency between the anchor instr and its - // predecessors/successors. - for (SDep &IDep : AnchorDeps) - if (IDep.getSUnit() == &DepSU) - IDep.setLatency(0); - - // Adjust the latency between the dependent instr and its - // successors/predecessors. - for (SDep &IDep : Preds ? DepSU.Succs : DepSU.Preds) - if (IDep.getSUnit() == &AnchorSU) - IDep.setLatency(0); - - DEBUG(dbgs() << DAG->MF.getName() << "(): Macro fuse "; - FirstSU.print(dbgs(), DAG); dbgs() << " - "; - SecondSU.print(dbgs(), DAG); dbgs() << " / "; - dbgs() << DAG->TII->getName(FirstMI->getOpcode()) << " - " << - DAG->TII->getName(SecondMI->getOpcode()) << '\n'; ); - - if (&SecondSU != &DAG->ExitSU) - // Make instructions dependent on FirstSU also dependent on SecondSU to - // prevent them from being scheduled between FirstSU and and SecondSU. - for (SUnit::const_succ_iterator - SI = FirstSU.Succs.begin(), SE = FirstSU.Succs.end(); - SI != SE; ++SI) { - if (!SI->getSUnit() || SI->getSUnit() == &SecondSU) - continue; - DEBUG(dbgs() << " Copy Succ "; - SI->getSUnit()->print(dbgs(), DAG); dbgs() << '\n';); - DAG->addEdge(SI->getSUnit(), SDep(&SecondSU, SDep::Artificial)); - } - - ++NumFused; - return true; - } - - return false; -} - -/// \brief Post-process the DAG to create cluster edges between instrs that may -/// be fused by the processor into a single operation. -class AArch64MacroFusion : public ScheduleDAGMutation { -public: - AArch64MacroFusion() {} - - void apply(ScheduleDAGInstrs *DAGInstrs) override; -}; - -void AArch64MacroFusion::apply(ScheduleDAGInstrs *DAGInstrs) { - ScheduleDAGMI *DAG = static_cast(DAGInstrs); - - // For each of the SUnits in the scheduling block, try to fuse the instr in it - // with one in its successors. - for (SUnit &ISU : DAG->SUnits) - scheduleAdjacentImpl(DAG, ISU); - - // Try to fuse the instr in the ExitSU with one in its predecessors. - scheduleAdjacentImpl(DAG, DAG->ExitSU); -} - -} // end namespace - - -namespace llvm { - std::unique_ptr createAArch64MacroFusionDAGMutation () { - return EnableMacroFusion ? make_unique() : nullptr; + return createMacroFusionDAGMutation(shouldScheduleAdjacent); } } // end namespace llvm Index: lib/Target/X86/X86MacroFusion.cpp =================================================================== --- lib/Target/X86/X86MacroFusion.cpp +++ lib/Target/X86/X86MacroFusion.cpp @@ -14,27 +14,19 @@ #include "X86MacroFusion.h" #include "X86Subtarget.h" -#include "llvm/ADT/Statistic.h" -#include "llvm/Support/CommandLine.h" #include "llvm/Target/TargetInstrInfo.h" - -#define DEBUG_TYPE "misched" - -STATISTIC(NumFused, "Number of instr pairs fused"); +#include "llvm/CodeGen/MacroFusion.h" using namespace llvm; -static cl::opt EnableMacroFusion("x86-misched-fusion", cl::Hidden, - cl::desc("Enable scheduling for macro fusion."), cl::init(true)); - -namespace { - /// \brief Verify that the instruction pair, First and Second, /// should be scheduled back to back. If either instruction is unspecified, /// then verify that the other instruction may be part of a pair at all. -static bool shouldScheduleAdjacent(const X86Subtarget &ST, +static bool shouldScheduleAdjacent(const TargetInstrInfo &TII, + const TargetSubtargetInfo &TSI, const MachineInstr *First, const MachineInstr *Second) { + const X86Subtarget &ST = static_cast(TSI); // Check if this processor supports macro-fusion. Since this is a minor // heuristic, we haven't specifically reserved a feature. hasAVX is a decent // proxy for SandyBridge+. @@ -203,69 +195,11 @@ } } -/// \brief Post-process the DAG to create cluster edges between instructions -/// that may be fused by the processor into a single operation. -class X86MacroFusion : public ScheduleDAGMutation { -public: - X86MacroFusion() {} - - void apply(ScheduleDAGInstrs *DAGInstrs) override; -}; - -void X86MacroFusion::apply(ScheduleDAGInstrs *DAGInstrs) { - ScheduleDAGMI *DAG = static_cast(DAGInstrs); - const X86Subtarget &ST = DAG->MF.getSubtarget(); - - // For now, assume targets can only fuse with the branch. - SUnit &ExitSU = DAG->ExitSU; - MachineInstr *Branch = ExitSU.getInstr(); - if (!Branch || !shouldScheduleAdjacent(ST, nullptr, Branch)) - return; - - for (SDep &PredDep : ExitSU.Preds) { - if (PredDep.isWeak()) - continue; - SUnit &SU = *PredDep.getSUnit(); - MachineInstr &Pred = *SU.getInstr(); - if (!shouldScheduleAdjacent(ST, &Pred, Branch)) - continue; - - // Create a single weak edge from SU to ExitSU. The only effect is to cause - // bottom-up scheduling to heavily prioritize the clustered SU. There is no - // need to copy predecessor edges from ExitSU to SU, since top-down - // scheduling cannot prioritize ExitSU anyway. To defer top-down scheduling - // of SU, we could create an artificial edge from the deepest root, but it - // hasn't been needed yet. - bool Success = DAG->addEdge(&ExitSU, SDep(&SU, SDep::Cluster)); - (void)Success; - assert(Success && "No DAG nodes should be reachable from ExitSU"); - - // Adjust latency of data deps between the nodes. - for (SDep &PredDep : ExitSU.Preds) - if (PredDep.getSUnit() == &SU) - PredDep.setLatency(0); - for (SDep &SuccDep : SU.Succs) - if (SuccDep.getSUnit() == &ExitSU) - SuccDep.setLatency(0); - - ++NumFused; - DEBUG(dbgs() << DAG->MF.getName() << "(): Macro fuse "; - SU.print(dbgs(), DAG); - dbgs() << " - ExitSU" - << " / " << DAG->TII->getName(Pred.getOpcode()) << " - " - << DAG->TII->getName(Branch->getOpcode()) << '\n';); - - break; - } -} - -} // end namespace - namespace llvm { std::unique_ptr createX86MacroFusionDAGMutation () { - return EnableMacroFusion ? make_unique() : nullptr; + return createMacroFusionDAGMutation(shouldScheduleAdjacent); } } // end namespace llvm