Index: llvm/include/llvm/Target/TargetInstrInfo.h
===================================================================
--- llvm/include/llvm/Target/TargetInstrInfo.h
+++ llvm/include/llvm/Target/TargetInstrInfo.h
@@ -39,6 +39,7 @@
 class ScheduleHazardRecognizer;
 class SelectionDAG;
 class ScheduleDAG;
+class ScheduleDAGInstrs;
 class TargetRegisterClass;
 class TargetRegisterInfo;
 class TargetSubtargetInfo;
@@ -1070,13 +1071,12 @@
     llvm_unreachable("target did not implement shouldClusterMemOps()");
   }
 
-  /// Can this target fuse the given instructions if they are scheduled
-  /// adjacent. Note that you have to add:
+  /// Override the fusion of instructions in the given scheduling block.
+  /// Note that you have to add:
   ///   DAG.addMutation(createMacroFusionDAGMutation());
   /// to TargetPassConfig::createMachineScheduler() to have an effect.
-  virtual bool shouldScheduleAdjacent(const MachineInstr &First,
-                                      const MachineInstr &Second) const {
-    llvm_unreachable("target did not implement shouldScheduleAdjacent()");
+  virtual void scheduleAdjacent(ScheduleDAGInstrs *DAGInstrs) const {
+    llvm_unreachable("target did not implement scheduleAdjacent()");
   }
 
   /// Reverses the branch condition of the specified condition list,
Index: llvm/lib/CodeGen/MachineScheduler.cpp
===================================================================
--- llvm/lib/CodeGen/MachineScheduler.cpp
+++ llvm/lib/CodeGen/MachineScheduler.cpp
@@ -1555,7 +1555,9 @@
 
   MacroFusion(const TargetInstrInfo &TII) : TII(TII) {}
 
-  void apply(ScheduleDAGInstrs *DAGInstrs) override;
+  void apply(ScheduleDAGInstrs *DAGInstrs) override {
+    TII.scheduleAdjacent(DAGInstrs);
+  }
 };
 
 } // anonymous
@@ -1568,50 +1570,6 @@
 
 } // namespace llvm
 
-/// \brief Callback from DAG postProcessing to create cluster edges to encourage
-/// fused operations.
-void MacroFusion::apply(ScheduleDAGInstrs *DAGInstrs) {
-  ScheduleDAGMI *DAG = static_cast<ScheduleDAGMI*>(DAGInstrs);
-
-  // For now, assume targets can only fuse with the branch.
-  SUnit &ExitSU = DAG->ExitSU;
-  MachineInstr *Branch = ExitSU.getInstr();
-  if (!Branch)
-    return;
-
-  for (SDep &PredDep : ExitSU.Preds) {
-    if (PredDep.isWeak())
-      continue;
-    SUnit &SU = *PredDep.getSUnit();
-    MachineInstr &Pred = *SU.getInstr();
-    if (!TII.shouldScheduleAdjacent(Pred, *Branch))
-      continue;
-
-    // Create a single weak edge from SU to ExitSU. The only effect is to cause
-    // bottom-up scheduling to heavily prioritize the clustered SU. There is no
-    // need to copy predecessor edges from ExitSU to SU, since top-down
-    // scheduling cannot prioritize ExitSU anyway. To defer top-down scheduling
-    // of SU, we could create an artificial edge from the deepest root, but it
-    // hasn't been needed yet.
-    bool Success = DAG->addEdge(&ExitSU, SDep(&SU, SDep::Cluster));
-    (void)Success;
-    assert(Success && "No DAG nodes should be reachable from ExitSU");
-
-    // Adjust latency of data deps between the nodes.
-    for (SDep &PredDep : ExitSU.Preds) {
-      if (PredDep.getSUnit() == &SU)
-        PredDep.setLatency(0);
-    }
-    for (SDep &SuccDep : SU.Succs) {
-      if (SuccDep.getSUnit() == &ExitSU)
-        SuccDep.setLatency(0);
-    }
-
-    DEBUG(dbgs() << "Macro Fuse SU(" << SU.NodeNum << ")\n");
-    break;
-  }
-}
-
 //===----------------------------------------------------------------------===//
 // CopyConstrain - DAG post-processing to encourage copy elimination.
 //===----------------------------------------------------------------------===//
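Note: for the new hook to fire at all, the macro fusion mutation still has to be
registered when the target constructs its scheduler, as the doc comment above
says. A minimal sketch of that wiring, assuming a target's TargetPassConfig
subclass (MyPassConfig is a placeholder, and the factory's exact signature may
differ between tree revisions):

    ScheduleDAGInstrs *
    MyPassConfig::createMachineScheduler(MachineSchedContext *C) const {
      ScheduleDAGMILive *DAG =
          new ScheduleDAGMILive(C, llvm::make_unique<GenericScheduler>(C));
      // Without this mutation, scheduleAdjacent() is never called: the
      // MacroFusion post-processing step above is what delegates to it.
      DAG->addMutation(createMacroFusionDAGMutation(DAG->TII));
      return DAG;
    }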
Index: llvm/lib/Target/AArch64/AArch64InstrInfo.h
===================================================================
--- llvm/lib/Target/AArch64/AArch64InstrInfo.h
+++ llvm/lib/Target/AArch64/AArch64InstrInfo.h
@@ -136,8 +136,7 @@
   bool shouldClusterMemOps(MachineInstr &FirstLdSt, MachineInstr &SecondLdSt,
                            unsigned NumLoads) const override;
 
-  bool shouldScheduleAdjacent(const MachineInstr &First,
-                              const MachineInstr &Second) const override;
+  void scheduleAdjacent(ScheduleDAGInstrs *DAGInstrs) const override;
 
   MachineInstr *emitFrameIndexDebugValue(MachineFunction &MF, int FrameIx,
                                          uint64_t Offset, const MDNode *Var,
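Note: the implementation below replaces the virtual pair predicate with a
static helper that also supports one-sided queries: passing nullptr for either
instruction maps its opcode to INSTRUCTION_LIST_END, which every fusion switch
accepts, so a caller can cheaply ask "could this instruction ever be half of a
pair?" before walking any dependence edges. A self-contained toy model of the
convention (the opcodes are invented for illustration, not part of the patch):

    #include <cassert>

    enum Opcode { CMP, ADD, BCC, OPCODE_LIST_END };

    // Pair predicate with a one-sided query mode: OPCODE_LIST_END stands in
    // for "any instruction", mirroring the INSTRUCTION_LIST_END trick.
    static bool shouldFuse(Opcode First, Opcode Second) {
      if (Second == BCC || Second == OPCODE_LIST_END)
        switch (First) {
        case CMP:              // a genuine fusion candidate
        case OPCODE_LIST_END:  // anchor-only query: some First could match
          return true;
        default:
          return false;
        }
      return false;
    }

    int main() {
      assert(shouldFuse(CMP, BCC));             // a real pair
      assert(shouldFuse(OPCODE_LIST_END, BCC)); // can anything lead into BCC?
      assert(shouldFuse(CMP, OPCODE_LIST_END)); // can CMP lead a pair at all?
      assert(!shouldFuse(ADD, BCC));            // ADD is not a candidate
      return 0;
    }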
Index: llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
===================================================================
--- llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
+++ llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
@@ -26,6 +26,7 @@
 #include "llvm/CodeGen/MachineMemOperand.h"
 #include "llvm/CodeGen/MachineOperand.h"
 #include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/MachineScheduler.h"
 #include "llvm/CodeGen/StackMaps.h"
 #include "llvm/IR/DebugLoc.h"
 #include "llvm/IR/GlobalValue.h"
@@ -1903,13 +1904,23 @@
   return Offset1 + 1 == Offset2;
 }
 
-bool AArch64InstrInfo::shouldScheduleAdjacent(
-    const MachineInstr &First, const MachineInstr &Second) const {
-  if (Subtarget.hasArithmeticBccFusion()) {
+/// \brief Verify that the instruction pair, \p First and \p Second, should
+/// be scheduled back to back.  Given an anchor instruction, if the other
+/// instruction is unspecified, then verify that the anchor instruction may
+/// be part of a pair at all.
+static bool shouldScheduleAdjacent(const AArch64InstrInfo &TII,
+                                   const AArch64Subtarget &ST,
+                                   const MachineInstr *First,
+                                   const MachineInstr *Second) {
+  unsigned FirstOpcode =
+      First ? First->getOpcode() : AArch64::INSTRUCTION_LIST_END;
+  unsigned SecondOpcode =
+      Second ? Second->getOpcode() : AArch64::INSTRUCTION_LIST_END;
+
+  if (ST.hasArithmeticBccFusion())
     // Fuse CMN, CMP, TST followed by Bcc.
-    unsigned SecondOpcode = Second.getOpcode();
-    if (SecondOpcode == AArch64::Bcc) {
-      switch (First.getOpcode()) {
+    if (SecondOpcode == AArch64::Bcc)
+      switch (FirstOpcode) {
       default:
         return false;
       case AArch64::ADDSWri:
@@ -1936,16 +1947,16 @@
       case AArch64::BICSWrs:
       case AArch64::BICSXrs:
         // Shift value can be 0 making these behave like the "rr" variant...
-        return !hasShiftedReg(Second);
+        return !TII.hasShiftedReg(*First);
+      case AArch64::INSTRUCTION_LIST_END:
+        return true;
       }
-    }
-  }
 
-  if (Subtarget.hasArithmeticCbzFusion()) {
+  if (ST.hasArithmeticCbzFusion())
     // Fuse ALU operations followed by CBZ/CBNZ.
-    unsigned SecondOpcode = Second.getOpcode();
     if (SecondOpcode == AArch64::CBNZW || SecondOpcode == AArch64::CBNZX ||
-        SecondOpcode == AArch64::CBZW || SecondOpcode == AArch64::CBZX) {
-      switch (First.getOpcode()) {
+        SecondOpcode == AArch64::CBZW || SecondOpcode == AArch64::CBZX)
+      switch (FirstOpcode) {
       default:
         return false;
       case AArch64::ADDWri:
@@ -1978,13 +1989,80 @@
       case AArch64::BICWrs:
       case AArch64::BICXrs:
         // Shift value can be 0 making these behave like the "rr" variant...
-        return !hasShiftedReg(Second);
+        return !TII.hasShiftedReg(*First);
+      case AArch64::INSTRUCTION_LIST_END:
+        return true;
       }
-  }
+
+  return false;
+}
+
+/// \brief Implement the fusion of instruction pairs in the scheduling
+/// \p DAG, anchored at the instruction in \p ASU.  \p Preds indicates if its
+/// dependencies in \p APreds are predecessors instead of successors.
+static bool scheduleAdjacentImpl(const AArch64InstrInfo &TII,
+                                 const AArch64Subtarget &ST,
+                                 ScheduleDAGMI *DAG, SUnit *ASU,
+                                 SmallVectorImpl<SDep> &APreds, bool Preds) {
+  const MachineInstr *AMI = ASU->getInstr();
+  if (!AMI || AMI->isPseudo() || AMI->isTransient() ||
+      (Preds && !shouldScheduleAdjacent(TII, ST, nullptr, AMI)) ||
+      (!Preds && !shouldScheduleAdjacent(TII, ST, AMI, nullptr)))
+    return false;
+
+  for (SDep &BDep : APreds) {
+    if (BDep.isWeak())
+      continue;
+
+    SUnit *BSU = BDep.getSUnit();
+    const MachineInstr *BMI = BSU->getInstr();
+    if (!BMI || BMI->isPseudo() || BMI->isTransient() ||
+        (Preds && !shouldScheduleAdjacent(TII, ST, BMI, AMI)) ||
+        (!Preds && !shouldScheduleAdjacent(TII, ST, AMI, BMI)))
+      continue;
+
+    // Create a single weak edge between the adjacent instrs. The only effect
+    // is to cause bottom-up scheduling to heavily prioritize the clustered
+    // instrs.
+    if (Preds)
+      DAG->addEdge(ASU, SDep(BSU, SDep::Cluster));
+    else
+      DAG->addEdge(BSU, SDep(ASU, SDep::Cluster));
+
+    // Adjust the latency between the 1st instr and its
+    // predecessors/successors.
+    for (SDep &Dep : APreds)
+      if (Dep.getSUnit() == BSU)
+        Dep.setLatency(0);
+
+    // Adjust the latency between the 2nd instr and its
+    // successors/predecessors.
+    auto &BSuccs = Preds ? BSU->Succs : BSU->Preds;
+    for (SDep &Dep : BSuccs)
+      if (Dep.getSUnit() == ASU)
+        Dep.setLatency(0);
+
+    return true;
+  }
+
+  return false;
+}
+
+/// \brief Callback from DAG postProcessing to create cluster edges to
+/// encourage fused operations.
+void AArch64InstrInfo::scheduleAdjacent(ScheduleDAGInstrs *DAGInstrs) const {
+  ScheduleDAGMI *DAG = static_cast<ScheduleDAGMI *>(DAGInstrs);
+
+  // For each of the SUnits in the scheduling block, try to fuse the
+  // instruction in it with one in its successors.
+  for (SUnit &ASU : DAG->SUnits)
+    scheduleAdjacentImpl(*this, Subtarget, DAG, &ASU, ASU.Succs, false);
+
+  // Try to fuse the instruction in the ExitSU with one in its predecessors.
+  scheduleAdjacentImpl(*this, Subtarget,
+                       DAG, &DAG->ExitSU, DAG->ExitSU.Preds, true);
+}
+
 MachineInstr *AArch64InstrInfo::emitFrameIndexDebugValue(
     MachineFunction &MF, int FrameIx, uint64_t Offset, const MDNode *Var,
     const MDNode *Expr, const DebugLoc &DL) const {
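Note: the core of scheduleAdjacentImpl above is a reusable two-step pattern:
add a weak SDep::Cluster edge so that bottom-up scheduling keeps the pair
together, then zero out the latency of the data edges between the two nodes so
the pair is costed like a single macro-op. Distilled into a standalone helper,
as a sketch against the ScheduleDAGMI/SDep API used in this patch (the helper
name is invented here):

    #include "llvm/CodeGen/MachineScheduler.h"

    // Cluster First before Second and cost the pair as one macro-op.
    // Returns false if the cluster edge could not be added.
    static bool clusterPair(llvm::ScheduleDAGMI *DAG, llvm::SUnit *First,
                            llvm::SUnit *Second) {
      using llvm::SDep;
      // The weak edge only biases the scheduler; it carries no latency
      // requirement of its own and may be dropped if unsatisfiable.
      if (!DAG->addEdge(Second, SDep(First, SDep::Cluster)))
        return false;
      // The dependence between the fused instructions resolves inside the
      // macro-op, so model it as zero latency in both edge lists.
      for (SDep &Dep : Second->Preds)
        if (Dep.getSUnit() == First)
          Dep.setLatency(0);
      for (SDep &Dep : First->Succs)
        if (Dep.getSUnit() == Second)
          Dep.setLatency(0);
      return true;
    }

Unlike the X86 code below, the AArch64 walk also visits the ordinary SUnits,
so fused pairs are no longer limited to a compare feeding the block-ending
branch.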
Index: llvm/lib/Target/X86/X86InstrInfo.h
===================================================================
--- llvm/lib/Target/X86/X86InstrInfo.h
+++ llvm/lib/Target/X86/X86InstrInfo.h
@@ -443,8 +443,7 @@
                                int64_t Offset1, int64_t Offset2,
                                unsigned NumLoads) const override;
 
-  bool shouldScheduleAdjacent(const MachineInstr &First,
-                              const MachineInstr &Second) const override;
+  void scheduleAdjacent(ScheduleDAGInstrs *DAGInstrs) const override;
 
   void getNoopForMachoTarget(MCInst &NopInst) const override;
 
Index: llvm/lib/Target/X86/X86InstrInfo.cpp
===================================================================
--- llvm/lib/Target/X86/X86InstrInfo.cpp
+++ llvm/lib/Target/X86/X86InstrInfo.cpp
@@ -26,6 +26,7 @@
 #include "llvm/CodeGen/MachineInstrBuilder.h"
 #include "llvm/CodeGen/MachineModuleInfo.h"
 #include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/MachineScheduler.h"
 #include "llvm/CodeGen/StackMaps.h"
 #include "llvm/IR/DerivedTypes.h"
 #include "llvm/IR/Function.h"
@@ -8294,8 +8295,12 @@
   return true;
 }
 
-bool X86InstrInfo::shouldScheduleAdjacent(const MachineInstr &First,
-                                          const MachineInstr &Second) const {
+/// \brief Verify that the instruction pair, \p First and \p Second, should
+/// be scheduled back to back.  If either instruction is unspecified, then
+/// verify that the other instruction may be part of a pair at all.
+static bool shouldScheduleAdjacent(const X86Subtarget &Subtarget,
+                                   const MachineInstr *First,
+                                   const MachineInstr *Second) {
   // Check if this processor supports macro-fusion. Since this is a minor
   // heuristic, we haven't specifically reserved a feature. hasAVX is a decent
   // proxy for SandyBridge+.
@@ -8308,7 +8313,12 @@
     FuseInc
   } FuseKind;
 
-  switch (Second.getOpcode()) {
+  unsigned FirstOpcode =
+      First ? First->getOpcode() : X86::INSTRUCTION_LIST_END;
+  unsigned SecondOpcode =
+      Second ? Second->getOpcode() : X86::INSTRUCTION_LIST_END;
+
+  switch (SecondOpcode) {
   default:
     return false;
   case X86::JE_1:
@@ -8334,7 +8344,8 @@
     FuseKind = FuseTest;
     break;
   }
-  switch (First.getOpcode()) {
+
+  switch (FirstOpcode) {
   default:
     return false;
   case X86::TEST8rr:
@@ -8450,6 +8461,49 @@
   case X86::DEC64r:
   case X86::DEC8r:
     return FuseKind == FuseInc;
+  case X86::INSTRUCTION_LIST_END:
+    return true;
+  }
+}
+
+void X86InstrInfo::scheduleAdjacent(ScheduleDAGInstrs *DAGInstrs) const {
+  ScheduleDAGMI *DAG = static_cast<ScheduleDAGMI *>(DAGInstrs);
+
+  // For now, assume targets can only fuse with the branch.
+  SUnit &ExitSU = DAG->ExitSU;
+  MachineInstr *Branch = ExitSU.getInstr();
+  if (!shouldScheduleAdjacent(Subtarget, nullptr, Branch))
+    return;
+
+  for (SDep &PredDep : ExitSU.Preds) {
+    if (PredDep.isWeak())
+      continue;
+    SUnit &SU = *PredDep.getSUnit();
+    MachineInstr &Pred = *SU.getInstr();
+    if (!shouldScheduleAdjacent(Subtarget, &Pred, Branch))
+      continue;
+
+    // Create a single weak edge from SU to ExitSU. The only effect is to cause
+    // bottom-up scheduling to heavily prioritize the clustered SU. There is no
+    // need to copy predecessor edges from ExitSU to SU, since top-down
+    // scheduling cannot prioritize ExitSU anyway. To defer top-down scheduling
+    // of SU, we could create an artificial edge from the deepest root, but it
+    // hasn't been needed yet.
+    bool Success = DAG->addEdge(&ExitSU, SDep(&SU, SDep::Cluster));
+    (void)Success;
+    assert(Success && "No DAG nodes should be reachable from ExitSU");
+
+    // Adjust latency of data deps between the nodes.
+    for (SDep &PredDep : ExitSU.Preds) {
+      if (PredDep.getSUnit() == &SU)
+        PredDep.setLatency(0);
+    }
+    for (SDep &SuccDep : SU.Succs) {
+      if (SuccDep.getSUnit() == &ExitSU)
+        SuccDep.setLatency(0);
+    }
+
+    break;
+  }
+}
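Note: the X86 hook is a near-verbatim move of the old generic
MacroFusion::apply; the only new behavior is the cheap early-out via
shouldScheduleAdjacent(Subtarget, nullptr, Branch) before walking the
predecessors. One observable loss in the move is the DEBUG trace of fused
SUnits, which neither target copy re-adds. If that trace is still wanted, it
could be restored next to the addEdge call; a sketch, assuming the usual
llvm/Support/Debug.h machinery and the DEBUG_TYPE already defined in
X86InstrInfo.cpp:

    // Mirrors the output the removed generic code emitted.
    DEBUG(dbgs() << "Macro fuse SU(" << SU.NodeNum << ") - ExitSU\n");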