Index: llvm/include/llvm/CodeGen/MachineScheduler.h
===================================================================
--- llvm/include/llvm/CodeGen/MachineScheduler.h
+++ llvm/include/llvm/CodeGen/MachineScheduler.h
@@ -1033,9 +1033,6 @@
                                const TargetRegisterInfo *TRI);
 
 std::unique_ptr<ScheduleDAGMutation>
-createMacroFusionDAGMutation(const TargetInstrInfo *TII);
-
-std::unique_ptr<ScheduleDAGMutation>
 createCopyConstrainDAGMutation(const TargetInstrInfo *TII,
                                const TargetRegisterInfo *TRI);
 
Index: llvm/include/llvm/Target/TargetInstrInfo.h
===================================================================
--- llvm/include/llvm/Target/TargetInstrInfo.h
+++ llvm/include/llvm/Target/TargetInstrInfo.h
@@ -1070,15 +1070,6 @@
     llvm_unreachable("target did not implement shouldClusterMemOps()");
   }
 
-  /// Can this target fuse the given instructions if they are scheduled
-  /// adjacent. Note that you have to add:
-  ///   DAG.addMutation(createMacroFusionDAGMutation());
-  /// to TargetPassConfig::createMachineScheduler() to have an effect.
-  virtual bool shouldScheduleAdjacent(const MachineInstr &First,
-                                      const MachineInstr &Second) const {
-    llvm_unreachable("target did not implement shouldScheduleAdjacent()");
-  }
-
   /// Reverses the branch condition of the specified condition list,
   /// returning false on success and true if it cannot be reversed.
   virtual
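For reference, the hook deleted above was consumed by the generic MacroFusion
mutation in MachineScheduler.cpp (removed in the next hunk), and a target
opted in from its pass config roughly as follows. This is a sketch of the old
pattern only; `MyPassConfig` is a placeholder name, not code from this patch:

    ScheduleDAGInstrs *
    MyPassConfig::createMachineScheduler(MachineSchedContext *C) const {
      ScheduleDAGMILive *DAG = createGenericSchedLive(C);
      // Generic mutation; dispatched to TII->shouldScheduleAdjacent().
      DAG->addMutation(createMacroFusionDAGMutation(DAG->TII));
      return DAG;
    }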
Index: llvm/lib/CodeGen/MachineScheduler.cpp
===================================================================
--- llvm/lib/CodeGen/MachineScheduler.cpp
+++ llvm/lib/CodeGen/MachineScheduler.cpp
@@ -80,10 +80,6 @@
                                   cl::desc("Enable memop clustering."),
                                   cl::init(true));
 
-// Experimental heuristics
-static cl::opt<bool> EnableMacroFusion("misched-fusion", cl::Hidden,
-  cl::desc("Enable scheduling for macro fusion."), cl::init(true));
-
 static cl::opt<bool> VerifyScheduling("verify-misched", cl::Hidden,
   cl::desc("Verify machine instrs before and after machine scheduling"));
 
@@ -1543,76 +1539,6 @@
 }
 
 //===----------------------------------------------------------------------===//
-// MacroFusion - DAG post-processing to encourage fusion of macro ops.
-//===----------------------------------------------------------------------===//
-
-namespace {
-/// \brief Post-process the DAG to create cluster edges between instructions
-/// that may be fused by the processor into a single operation.
-class MacroFusion : public ScheduleDAGMutation {
-  const TargetInstrInfo &TII;
-public:
-  MacroFusion(const TargetInstrInfo &TII)
-    : TII(TII) {}
-
-  void apply(ScheduleDAGInstrs *DAGInstrs) override;
-};
-} // anonymous
-
-namespace llvm {
-
-std::unique_ptr<ScheduleDAGMutation>
-createMacroFusionDAGMutation(const TargetInstrInfo *TII) {
-  return EnableMacroFusion ? make_unique<MacroFusion>(*TII) : nullptr;
-}
-
-} // namespace llvm
-
-/// \brief Callback from DAG postProcessing to create cluster edges to
-/// encourage fused operations.
-void MacroFusion::apply(ScheduleDAGInstrs *DAGInstrs) {
-  ScheduleDAGMI *DAG = static_cast<ScheduleDAGMI *>(DAGInstrs);
-
-  // For now, assume targets can only fuse with the branch.
-  SUnit &ExitSU = DAG->ExitSU;
-  MachineInstr *Branch = ExitSU.getInstr();
-  if (!Branch)
-    return;
-
-  for (SDep &PredDep : ExitSU.Preds) {
-    if (PredDep.isWeak())
-      continue;
-    SUnit &SU = *PredDep.getSUnit();
-    MachineInstr &Pred = *SU.getInstr();
-    if (!TII.shouldScheduleAdjacent(Pred, *Branch))
-      continue;
-
-    // Create a single weak edge from SU to ExitSU. The only effect is to cause
-    // bottom-up scheduling to heavily prioritize the clustered SU. There is no
-    // need to copy predecessor edges from ExitSU to SU, since top-down
-    // scheduling cannot prioritize ExitSU anyway. To defer top-down scheduling
-    // of SU, we could create an artificial edge from the deepest root, but it
-    // hasn't been needed yet.
-    bool Success = DAG->addEdge(&ExitSU, SDep(&SU, SDep::Cluster));
-    (void)Success;
-    assert(Success && "No DAG nodes should be reachable from ExitSU");
-
-    // Adjust latency of data deps between the nodes.
-    for (SDep &PredDep : ExitSU.Preds) {
-      if (PredDep.getSUnit() == &SU)
-        PredDep.setLatency(0);
-    }
-    for (SDep &SuccDep : SU.Succs) {
-      if (SuccDep.getSUnit() == &ExitSU)
-        SuccDep.setLatency(0);
-    }
-
-    DEBUG(dbgs() << "Macro Fuse SU(" << SU.NodeNum << ")\n");
-    break;
-  }
-}
-
-//===----------------------------------------------------------------------===//
 // CopyConstrain - DAG post-processing to encourage copy elimination.
 //===----------------------------------------------------------------------===//
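A note for downstream users: the hunk above also retires the generic
-misched-fusion flag. The equivalent controls now live with the target
mutations added below, as -aarch64-misched-fusion and -x86-misched-fusion,
both defaulting to true. As cl::opt booleans they should accept an explicit
value, e.g. `llc -x86-misched-fusion=false`, though that spelling is an
assumption on my part and is not exercised by this patch.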
Index: llvm/lib/Target/AArch64/AArch64InstrInfo.h
===================================================================
--- llvm/lib/Target/AArch64/AArch64InstrInfo.h
+++ llvm/lib/Target/AArch64/AArch64InstrInfo.h
@@ -136,9 +136,6 @@
   bool shouldClusterMemOps(MachineInstr &FirstLdSt, MachineInstr &SecondLdSt,
                            unsigned NumLoads) const override;
 
-  bool shouldScheduleAdjacent(const MachineInstr &First,
-                              const MachineInstr &Second) const override;
-
   MachineInstr *emitFrameIndexDebugValue(MachineFunction &MF, int FrameIx,
                                          uint64_t Offset, const MDNode *Var,
                                          const MDNode *Expr,
Index: llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
===================================================================
--- llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
+++ llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
@@ -1903,88 +1903,6 @@
   return Offset1 + 1 == Offset2;
 }
 
-bool AArch64InstrInfo::shouldScheduleAdjacent(
-    const MachineInstr &First, const MachineInstr &Second) const {
-  if (Subtarget.hasArithmeticBccFusion()) {
-    // Fuse CMN, CMP, TST followed by Bcc.
-    unsigned SecondOpcode = Second.getOpcode();
-    if (SecondOpcode == AArch64::Bcc) {
-      switch (First.getOpcode()) {
-      default:
-        return false;
-      case AArch64::ADDSWri:
-      case AArch64::ADDSWrr:
-      case AArch64::ADDSXri:
-      case AArch64::ADDSXrr:
-      case AArch64::ANDSWri:
-      case AArch64::ANDSWrr:
-      case AArch64::ANDSXri:
-      case AArch64::ANDSXrr:
-      case AArch64::SUBSWri:
-      case AArch64::SUBSWrr:
-      case AArch64::SUBSXri:
-      case AArch64::SUBSXrr:
-      case AArch64::BICSWrr:
-      case AArch64::BICSXrr:
-        return true;
-      case AArch64::ADDSWrs:
-      case AArch64::ADDSXrs:
-      case AArch64::ANDSWrs:
-      case AArch64::ANDSXrs:
-      case AArch64::SUBSWrs:
-      case AArch64::SUBSXrs:
-      case AArch64::BICSWrs:
-      case AArch64::BICSXrs:
-        // Shift value can be 0 making these behave like the "rr" variant...
-        return !hasShiftedReg(Second);
-      }
-    }
-  }
-  if (Subtarget.hasArithmeticCbzFusion()) {
-    // Fuse ALU operations followed by CBZ/CBNZ.
-    unsigned SecondOpcode = Second.getOpcode();
-    if (SecondOpcode == AArch64::CBNZW || SecondOpcode == AArch64::CBNZX ||
-        SecondOpcode == AArch64::CBZW || SecondOpcode == AArch64::CBZX) {
-      switch (First.getOpcode()) {
-      default:
-        return false;
-      case AArch64::ADDWri:
-      case AArch64::ADDWrr:
-      case AArch64::ADDXri:
-      case AArch64::ADDXrr:
-      case AArch64::ANDWri:
-      case AArch64::ANDWrr:
-      case AArch64::ANDXri:
-      case AArch64::ANDXrr:
-      case AArch64::EORWri:
-      case AArch64::EORWrr:
-      case AArch64::EORXri:
-      case AArch64::EORXrr:
-      case AArch64::ORRWri:
-      case AArch64::ORRWrr:
-      case AArch64::ORRXri:
-      case AArch64::ORRXrr:
-      case AArch64::SUBWri:
-      case AArch64::SUBWrr:
-      case AArch64::SUBXri:
-      case AArch64::SUBXrr:
-        return true;
-      case AArch64::ADDWrs:
-      case AArch64::ADDXrs:
-      case AArch64::ANDWrs:
-      case AArch64::ANDXrs:
-      case AArch64::SUBWrs:
-      case AArch64::SUBXrs:
-      case AArch64::BICWrs:
-      case AArch64::BICXrs:
-        // Shift value can be 0 making these behave like the "rr" variant...
-        return !hasShiftedReg(Second);
-      }
-    }
-  }
-  return false;
-}
-
 MachineInstr *AArch64InstrInfo::emitFrameIndexDebugValue(
     MachineFunction &MF, int FrameIx, uint64_t Offset, const MDNode *Var,
     const MDNode *Expr, const DebugLoc &DL) const {
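The opcode lists deleted above reappear essentially verbatim in
AArch64MacroFusion.cpp below. The one functional addition is a sentinel case
in each switch so that a lone anchor instruction can be queried; the
convention, taken from the new code, is:

    // A null MachineInstr* means "unspecified instruction" and is mapped
    // onto a sentinel opcode that the opcode switches accept explicitly.
    unsigned FirstOpcode =
        First ? First->getOpcode() : AArch64::INSTRUCTION_LIST_END;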
Index: llvm/lib/Target/AArch64/AArch64MacroFusion.h
===================================================================
--- /dev/null
+++ llvm/lib/Target/AArch64/AArch64MacroFusion.h
@@ -0,0 +1,38 @@
+//===- AArch64MacroFusion.h - AArch64 Macro Fusion ------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the AArch64 definition of the DAG scheduling mutation
+// to pair instructions back to back.
+//
+//===----------------------------------------------------------------------===//
+
+#include "AArch64InstrInfo.h"
+#include "llvm/CodeGen/MachineScheduler.h"
+
+//===----------------------------------------------------------------------===//
+// AArch64MacroFusion - DAG post-processing to encourage fusion of macro ops.
+//===----------------------------------------------------------------------===//
+
+namespace llvm {
+
+/// \brief Post-process the DAG to create cluster edges between instructions
+/// that may be fused by the processor into a single operation.
+class AArch64MacroFusion : public ScheduleDAGMutation {
+public:
+  AArch64MacroFusion() {}
+
+  void apply(ScheduleDAGInstrs *DAGInstrs) override;
+};
+
+/// Note that you have to add:
+///   DAG.addMutation(createAArch64MacroFusionDAGMutation());
+/// to AArch64PassConfig::createMachineScheduler() to have an effect.
+std::unique_ptr<ScheduleDAGMutation> createAArch64MacroFusionDAGMutation();
+
+} // namespace llvm
Index: llvm/lib/Target/AArch64/AArch64MacroFusion.cpp
===================================================================
--- /dev/null
+++ llvm/lib/Target/AArch64/AArch64MacroFusion.cpp
@@ -0,0 +1,195 @@
+//===- AArch64MacroFusion.cpp - AArch64 Macro Fusion ----------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the AArch64 implementation of the DAG scheduling
+// mutation to pair instructions back to back.
+//
+//===----------------------------------------------------------------------===//
+
+#include "AArch64MacroFusion.h"
+#include "AArch64Subtarget.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Target/TargetInstrInfo.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "misched"
+
+static cl::opt<bool> EnableMacroFusion("aarch64-misched-fusion", cl::Hidden,
+  cl::desc("Enable scheduling for macro fusion."), cl::init(true));
+
+namespace llvm {
+
+/// \brief Verify that the instruction pair, \p First and \p Second, should be
+/// scheduled back to back. Given an anchor instruction, if the other
+/// instruction is unspecified, then verify that the anchor instruction may be
+/// part of a pair at all.
+static bool shouldScheduleAdjacent(const AArch64InstrInfo &TII,
+                                   const AArch64Subtarget &ST,
+                                   const MachineInstr *First,
+                                   const MachineInstr *Second) {
+  unsigned FirstOpcode =
+      First ? First->getOpcode() : AArch64::INSTRUCTION_LIST_END;
+  unsigned SecondOpcode =
+      Second ? Second->getOpcode() : AArch64::INSTRUCTION_LIST_END;
+
+  if (ST.hasArithmeticBccFusion())
+    // Fuse CMN, CMP, TST followed by Bcc.
+    if (SecondOpcode == AArch64::Bcc)
+      switch (FirstOpcode) {
+      default:
+        return false;
+      case AArch64::ADDSWri:
+      case AArch64::ADDSWrr:
+      case AArch64::ADDSXri:
+      case AArch64::ADDSXrr:
+      case AArch64::ANDSWri:
+      case AArch64::ANDSWrr:
+      case AArch64::ANDSXri:
+      case AArch64::ANDSXrr:
+      case AArch64::SUBSWri:
+      case AArch64::SUBSWrr:
+      case AArch64::SUBSXri:
+      case AArch64::SUBSXrr:
+      case AArch64::BICSWrr:
+      case AArch64::BICSXrr:
+        return true;
+      case AArch64::ADDSWrs:
+      case AArch64::ADDSXrs:
+      case AArch64::ANDSWrs:
+      case AArch64::ANDSXrs:
+      case AArch64::SUBSWrs:
+      case AArch64::SUBSXrs:
+      case AArch64::BICSWrs:
+      case AArch64::BICSXrs:
+        // Shift value can be 0 making these behave like the "rr" variant...
+        return !TII.hasShiftedReg(*First);
+      case AArch64::INSTRUCTION_LIST_END:
+        return true;
+      }
+
+  if (ST.hasArithmeticCbzFusion())
+    // Fuse ALU operations followed by CBZ/CBNZ.
+    if (SecondOpcode == AArch64::CBNZW || SecondOpcode == AArch64::CBNZX ||
+        SecondOpcode == AArch64::CBZW || SecondOpcode == AArch64::CBZX)
+      switch (FirstOpcode) {
+      default:
+        return false;
+      case AArch64::ADDWri:
+      case AArch64::ADDWrr:
+      case AArch64::ADDXri:
+      case AArch64::ADDXrr:
+      case AArch64::ANDWri:
+      case AArch64::ANDWrr:
+      case AArch64::ANDXri:
+      case AArch64::ANDXrr:
+      case AArch64::EORWri:
+      case AArch64::EORWrr:
+      case AArch64::EORXri:
+      case AArch64::EORXrr:
+      case AArch64::ORRWri:
+      case AArch64::ORRWrr:
+      case AArch64::ORRXri:
+      case AArch64::ORRXrr:
+      case AArch64::SUBWri:
+      case AArch64::SUBWrr:
+      case AArch64::SUBXri:
+      case AArch64::SUBXrr:
+        return true;
+      case AArch64::ADDWrs:
+      case AArch64::ADDXrs:
+      case AArch64::ANDWrs:
+      case AArch64::ANDXrs:
+      case AArch64::SUBWrs:
+      case AArch64::SUBXrs:
+      case AArch64::BICWrs:
+      case AArch64::BICXrs:
+        // Shift value can be 0 making these behave like the "rr" variant...
+        return !TII.hasShiftedReg(*First);
+      case AArch64::INSTRUCTION_LIST_END:
+        return true;
+      }
+
+  return false;
+}
+
+/// \brief Implement the fusion of instruction pairs in the scheduling DAG
+/// \p DAG, anchored at the instruction in \p ASU. \p Preds indicates whether
+/// the dependencies in \p APreds are predecessors (true) or successors
+/// (false) of \p ASU.
+static bool scheduleAdjacentImpl(ScheduleDAGMI *DAG, SUnit *ASU,
+                                 SmallVectorImpl<SDep> &APreds, bool Preds) {
+  const AArch64InstrInfo *TII =
+      static_cast<const AArch64InstrInfo *>(DAG->TII);
+  const AArch64Subtarget &ST = DAG->MF.getSubtarget<AArch64Subtarget>();
+
+  const MachineInstr *AMI = ASU->getInstr();
+  if (!AMI || AMI->isPseudo() || AMI->isTransient() ||
+      (Preds && !shouldScheduleAdjacent(*TII, ST, nullptr, AMI)) ||
+      (!Preds && !shouldScheduleAdjacent(*TII, ST, AMI, nullptr)))
+    return false;
+
+  for (SDep &BDep : APreds) {
+    if (BDep.isWeak())
+      continue;
+
+    SUnit *BSU = BDep.getSUnit();
+    const MachineInstr *BMI = BSU->getInstr();
+    if (!BMI || BMI->isPseudo() || BMI->isTransient() ||
+        (Preds && !shouldScheduleAdjacent(*TII, ST, BMI, AMI)) ||
+        (!Preds && !shouldScheduleAdjacent(*TII, ST, AMI, BMI)))
+      continue;
+
+    // Create a single weak edge between the adjacent instrs. The only effect
+    // is to cause bottom-up scheduling to heavily prioritize the clustered
+    // instrs.
+    if (Preds)
+      DAG->addEdge(ASU, SDep(BSU, SDep::Cluster));
+    else
+      DAG->addEdge(BSU, SDep(ASU, SDep::Cluster));
+
+    // Adjust the latency between the 1st instr and its predecessors/
+    // successors.
+    for (SDep &Dep : APreds)
+      if (Dep.getSUnit() == BSU)
+        Dep.setLatency(0);
+
+    // Adjust the latency between the 2nd instr and its successors/
+    // predecessors.
+    auto &BSuccs = Preds ? BSU->Succs : BSU->Preds;
+    for (SDep &Dep : BSuccs)
+      if (Dep.getSUnit() == ASU)
+        Dep.setLatency(0);
+
+    DEBUG(dbgs() << "Macro fuse ";
+          Preds ? BSU->print(dbgs(), DAG) : ASU->print(dbgs(), DAG);
+          dbgs() << " - ";
+          Preds ? ASU->print(dbgs(), DAG) : BSU->print(dbgs(), DAG);
+          dbgs() << '\n');
+
+    return true;
+  }
+
+  return false;
+}
+
+void AArch64MacroFusion::apply(ScheduleDAGInstrs *DAGInstrs) {
+  ScheduleDAGMI *DAG = static_cast<ScheduleDAGMI *>(DAGInstrs);
+
+  // For each of the SUnits in the scheduling block, try to fuse the
+  // instruction in it with one in its successors.
+  for (SUnit &ASU : DAG->SUnits)
+    scheduleAdjacentImpl(DAG, &ASU, ASU.Succs, false);
+
+  // Try to fuse the instruction in the ExitSU with one in its predecessors.
+  scheduleAdjacentImpl(DAG, &DAG->ExitSU, DAG->ExitSU.Preds, true);
+}
+
+std::unique_ptr<ScheduleDAGMutation> createAArch64MacroFusionDAGMutation() {
+  return EnableMacroFusion ? make_unique<AArch64MacroFusion>() : nullptr;
+}
+
+} // namespace llvm
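As the header's comment says, the mutation only takes effect once it is
registered, which the AArch64TargetMachine.cpp hunk below does:

    DAG->addMutation(createAArch64MacroFusionDAGMutation());

Because createAArch64MacroFusionDAGMutation() returns nullptr when
-aarch64-misched-fusion is off, and ScheduleDAGMI::addMutation() drops null
mutations, the flag cleanly disables the whole transformation.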
Index: llvm/lib/Target/AArch64/AArch64TargetMachine.cpp
===================================================================
--- llvm/lib/Target/AArch64/AArch64TargetMachine.cpp
+++ llvm/lib/Target/AArch64/AArch64TargetMachine.cpp
@@ -14,6 +14,7 @@
 #include "AArch64CallLowering.h"
 #include "AArch64InstructionSelector.h"
 #include "AArch64LegalizerInfo.h"
+#include "AArch64MacroFusion.h"
 #include "AArch64RegisterBankInfo.h"
 #include "AArch64Subtarget.h"
 #include "AArch64TargetMachine.h"
@@ -323,7 +324,7 @@
     ScheduleDAGMILive *DAG = createGenericSchedLive(C);
     DAG->addMutation(createLoadClusterDAGMutation(DAG->TII, DAG->TRI));
     DAG->addMutation(createStoreClusterDAGMutation(DAG->TII, DAG->TRI));
-    DAG->addMutation(createMacroFusionDAGMutation(DAG->TII));
+    DAG->addMutation(createAArch64MacroFusionDAGMutation());
     return DAG;
   }
Index: llvm/lib/Target/AArch64/CMakeLists.txt
===================================================================
--- llvm/lib/Target/AArch64/CMakeLists.txt
+++ llvm/lib/Target/AArch64/CMakeLists.txt
@@ -55,6 +55,7 @@
   AArch64ISelLowering.cpp
   AArch64InstrInfo.cpp
   AArch64LoadStoreOptimizer.cpp
+  AArch64MacroFusion.cpp
   AArch64MCInstLower.cpp
   AArch64PromoteConstant.cpp
   AArch64PBQPRegAlloc.cpp
Index: llvm/lib/Target/X86/CMakeLists.txt
===================================================================
--- llvm/lib/Target/X86/CMakeLists.txt
+++ llvm/lib/Target/X86/CMakeLists.txt
@@ -43,6 +43,7 @@
   X86EvexToVex.cpp
   X86MCInstLower.cpp
   X86MachineFunctionInfo.cpp
+  X86MacroFusion.cpp
   X86OptimizeLEAs.cpp
   X86PadShortFunction.cpp
   X86RegisterInfo.cpp
Index: llvm/lib/Target/X86/X86InstrInfo.h
===================================================================
--- llvm/lib/Target/X86/X86InstrInfo.h
+++ llvm/lib/Target/X86/X86InstrInfo.h
@@ -443,9 +443,6 @@
                                int64_t Offset1, int64_t Offset2,
                                unsigned NumLoads) const override;
 
-  bool shouldScheduleAdjacent(const MachineInstr &First,
-                              const MachineInstr &Second) const override;
-
   void getNoopForMachoTarget(MCInst &NopInst) const override;
 
   bool
Index: llvm/lib/Target/X86/X86InstrInfo.cpp
===================================================================
--- llvm/lib/Target/X86/X86InstrInfo.cpp
+++ llvm/lib/Target/X86/X86InstrInfo.cpp
@@ -8294,165 +8294,6 @@
   return true;
 }
 
-bool X86InstrInfo::shouldScheduleAdjacent(const MachineInstr &First,
-                                          const MachineInstr &Second) const {
-  // Check if this processor supports macro-fusion. Since this is a minor
-  // heuristic, we haven't specifically reserved a feature. hasAVX is a decent
-  // proxy for SandyBridge+.
-  if (!Subtarget.hasAVX())
-    return false;
-
-  enum {
-    FuseTest,
-    FuseCmp,
-    FuseInc
-  } FuseKind;
-
-  switch (Second.getOpcode()) {
-  default:
-    return false;
-  case X86::JE_1:
-  case X86::JNE_1:
-  case X86::JL_1:
-  case X86::JLE_1:
-  case X86::JG_1:
-  case X86::JGE_1:
-    FuseKind = FuseInc;
-    break;
-  case X86::JB_1:
-  case X86::JBE_1:
-  case X86::JA_1:
-  case X86::JAE_1:
-    FuseKind = FuseCmp;
-    break;
-  case X86::JS_1:
-  case X86::JNS_1:
-  case X86::JP_1:
-  case X86::JNP_1:
-  case X86::JO_1:
-  case X86::JNO_1:
-    FuseKind = FuseTest;
-    break;
-  }
-  switch (First.getOpcode()) {
-  default:
-    return false;
-  case X86::TEST8rr:
-  case X86::TEST16rr:
-  case X86::TEST32rr:
-  case X86::TEST64rr:
-  case X86::TEST8ri:
-  case X86::TEST16ri:
-  case X86::TEST32ri:
-  case X86::TEST32i32:
-  case X86::TEST64i32:
-  case X86::TEST64ri32:
-  case X86::TEST8rm:
-  case X86::TEST16rm:
-  case X86::TEST32rm:
-  case X86::TEST64rm:
-  case X86::TEST8ri_NOREX:
-  case X86::AND16i16:
-  case X86::AND16ri:
-  case X86::AND16ri8:
-  case X86::AND16rm:
-  case X86::AND16rr:
-  case X86::AND32i32:
-  case X86::AND32ri:
-  case X86::AND32ri8:
-  case X86::AND32rm:
-  case X86::AND32rr:
-  case X86::AND64i32:
-  case X86::AND64ri32:
-  case X86::AND64ri8:
-  case X86::AND64rm:
-  case X86::AND64rr:
-  case X86::AND8i8:
-  case X86::AND8ri:
-  case X86::AND8rm:
-  case X86::AND8rr:
-    return true;
-  case X86::CMP16i16:
-  case X86::CMP16ri:
-  case X86::CMP16ri8:
-  case X86::CMP16rm:
-  case X86::CMP16rr:
-  case X86::CMP32i32:
-  case X86::CMP32ri:
-  case X86::CMP32ri8:
-  case X86::CMP32rm:
-  case X86::CMP32rr:
-  case X86::CMP64i32:
-  case X86::CMP64ri32:
-  case X86::CMP64ri8:
-  case X86::CMP64rm:
-  case X86::CMP64rr:
-  case X86::CMP8i8:
-  case X86::CMP8ri:
-  case X86::CMP8rm:
-  case X86::CMP8rr:
-  case X86::ADD16i16:
-  case X86::ADD16ri:
-  case X86::ADD16ri8:
-  case X86::ADD16ri8_DB:
-  case X86::ADD16ri_DB:
-  case X86::ADD16rm:
-  case X86::ADD16rr:
-  case X86::ADD16rr_DB:
-  case X86::ADD32i32:
-  case X86::ADD32ri:
-  case X86::ADD32ri8:
-  case X86::ADD32ri8_DB:
-  case X86::ADD32ri_DB:
-  case X86::ADD32rm:
-  case X86::ADD32rr:
-  case X86::ADD32rr_DB:
-  case X86::ADD64i32:
-  case X86::ADD64ri32:
-  case X86::ADD64ri32_DB:
-  case X86::ADD64ri8:
-  case X86::ADD64ri8_DB:
-  case X86::ADD64rm:
-  case X86::ADD64rr:
-  case X86::ADD64rr_DB:
-  case X86::ADD8i8:
-  case X86::ADD8mi:
-  case X86::ADD8mr:
-  case X86::ADD8ri:
-  case X86::ADD8rm:
-  case X86::ADD8rr:
-  case X86::SUB16i16:
-  case X86::SUB16ri:
-  case X86::SUB16ri8:
-  case X86::SUB16rm:
-  case X86::SUB16rr:
-  case X86::SUB32i32:
-  case X86::SUB32ri:
-  case X86::SUB32ri8:
-  case X86::SUB32rm:
-  case X86::SUB32rr:
-  case X86::SUB64i32:
-  case X86::SUB64ri32:
-  case X86::SUB64ri8:
-  case X86::SUB64rm:
-  case X86::SUB64rr:
-  case X86::SUB8i8:
-  case X86::SUB8ri:
-  case X86::SUB8rm:
-  case X86::SUB8rr:
-    return FuseKind == FuseCmp || FuseKind == FuseInc;
-  case X86::INC16r:
-  case X86::INC32r:
-  case X86::INC64r:
-  case X86::INC8r:
-  case X86::DEC16r:
-  case X86::DEC32r:
-  case X86::DEC64r:
-  case X86::DEC8r:
-    return FuseKind == FuseInc;
-  }
-}
-
 bool X86InstrInfo::
 reverseBranchCondition(SmallVectorImpl<MachineOperand> &Cond) const {
   assert(Cond.size() == 1 && "Invalid X86 branch condition!");
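Condensed from the returns in the switch just deleted (and preserved in
X86MacroFusion.cpp below), the gating works out to:

    // Which first instructions fuse with which branch kind:
    //   FuseTest (JS/JNS/JP/JNP/JO/JNO): TEST, AND
    //   FuseCmp  (JB/JBE/JA/JAE):        TEST, AND, CMP, ADD, SUB
    //   FuseInc  (JE/JNE/JL/JLE/JG/JGE): TEST, AND, CMP, ADD, SUB, INC, DEC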
Index: llvm/lib/Target/X86/X86MacroFusion.h
===================================================================
--- /dev/null
+++ llvm/lib/Target/X86/X86MacroFusion.h
@@ -0,0 +1,39 @@
+//===- X86MacroFusion.h - X86 Macro Fusion --------------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the X86 definition of the DAG scheduling mutation to pair
+// instructions back to back.
+//
+//===----------------------------------------------------------------------===//
+
+#include "X86InstrInfo.h"
+#include "llvm/CodeGen/MachineScheduler.h"
+
+//===----------------------------------------------------------------------===//
+// X86MacroFusion - DAG post-processing to encourage fusion of macro ops.
+//===----------------------------------------------------------------------===//
+
+namespace llvm {
+
+/// \brief Post-process the DAG to create cluster edges between instructions
+/// that may be fused by the processor into a single operation.
+class X86MacroFusion : public ScheduleDAGMutation {
+public:
+  X86MacroFusion() {}
+
+  void apply(ScheduleDAGInstrs *DAGInstrs) override;
+};
+
+/// Note that you have to add:
+///   DAG.addMutation(createX86MacroFusionDAGMutation());
+/// to X86PassConfig::createMachineScheduler() to have an effect.
+std::unique_ptr<ScheduleDAGMutation>
+createX86MacroFusionDAGMutation();
+
+} // namespace llvm
Index: llvm/lib/Target/X86/X86MacroFusion.cpp
===================================================================
--- /dev/null
+++ llvm/lib/Target/X86/X86MacroFusion.cpp
@@ -0,0 +1,249 @@
+//===- X86MacroFusion.cpp - X86 Macro Fusion ------------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the X86 implementation of the DAG scheduling mutation to
+// pair instructions back to back.
+//
+//===----------------------------------------------------------------------===//
+
+#include "X86MacroFusion.h"
+#include "X86Subtarget.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Target/TargetInstrInfo.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "misched"
+
+static cl::opt<bool> EnableMacroFusion("x86-misched-fusion", cl::Hidden,
+  cl::desc("Enable scheduling for macro fusion."), cl::init(true));
+
+namespace llvm {
+
+/// \brief Verify that the instruction pair, \p First and \p Second, should be
+/// scheduled back to back. If either instruction is unspecified, then verify
+/// that the other instruction may be part of a pair at all.
+static bool shouldScheduleAdjacent(const X86Subtarget &ST,
+                                   const MachineInstr *First,
+                                   const MachineInstr *Second) {
+  // Check if this processor supports macro-fusion. Since this is a minor
+  // heuristic, we haven't specifically reserved a feature. hasAVX is a decent
+  // proxy for SandyBridge+.
+  if (!ST.hasAVX())
+    return false;
+
+  enum {
+    FuseTest,
+    FuseCmp,
+    FuseInc
+  } FuseKind;
+
+  unsigned FirstOpcode =
+      First ? First->getOpcode() : X86::INSTRUCTION_LIST_END;
+  unsigned SecondOpcode =
+      Second ? Second->getOpcode() : X86::INSTRUCTION_LIST_END;
+
+  switch (SecondOpcode) {
+  default:
+    return false;
+  case X86::JE_1:
+  case X86::JNE_1:
+  case X86::JL_1:
+  case X86::JLE_1:
+  case X86::JG_1:
+  case X86::JGE_1:
+    FuseKind = FuseInc;
+    break;
+  case X86::JB_1:
+  case X86::JBE_1:
+  case X86::JA_1:
+  case X86::JAE_1:
+    FuseKind = FuseCmp;
+    break;
+  case X86::JS_1:
+  case X86::JNS_1:
+  case X86::JP_1:
+  case X86::JNP_1:
+  case X86::JO_1:
+  case X86::JNO_1:
+    FuseKind = FuseTest;
+    break;
+  }
+
+  switch (FirstOpcode) {
+  default:
+    return false;
+  case X86::TEST8rr:
+  case X86::TEST16rr:
+  case X86::TEST32rr:
+  case X86::TEST64rr:
+  case X86::TEST8ri:
+  case X86::TEST16ri:
+  case X86::TEST32ri:
+  case X86::TEST32i32:
+  case X86::TEST64i32:
+  case X86::TEST64ri32:
+  case X86::TEST8rm:
+  case X86::TEST16rm:
+  case X86::TEST32rm:
+  case X86::TEST64rm:
+  case X86::TEST8ri_NOREX:
+  case X86::AND16i16:
+  case X86::AND16ri:
+  case X86::AND16ri8:
+  case X86::AND16rm:
+  case X86::AND16rr:
+  case X86::AND32i32:
+  case X86::AND32ri:
+  case X86::AND32ri8:
+  case X86::AND32rm:
+  case X86::AND32rr:
+  case X86::AND64i32:
+  case X86::AND64ri32:
+  case X86::AND64ri8:
+  case X86::AND64rm:
+  case X86::AND64rr:
+  case X86::AND8i8:
+  case X86::AND8ri:
+  case X86::AND8rm:
+  case X86::AND8rr:
+    return true;
+  case X86::CMP16i16:
+  case X86::CMP16ri:
+  case X86::CMP16ri8:
+  case X86::CMP16rm:
+  case X86::CMP16rr:
+  case X86::CMP32i32:
+  case X86::CMP32ri:
+  case X86::CMP32ri8:
+  case X86::CMP32rm:
+  case X86::CMP32rr:
+  case X86::CMP64i32:
+  case X86::CMP64ri32:
+  case X86::CMP64ri8:
+  case X86::CMP64rm:
+  case X86::CMP64rr:
+  case X86::CMP8i8:
+  case X86::CMP8ri:
+  case X86::CMP8rm:
+  case X86::CMP8rr:
+  case X86::ADD16i16:
+  case X86::ADD16ri:
+  case X86::ADD16ri8:
+  case X86::ADD16ri8_DB:
+  case X86::ADD16ri_DB:
+  case X86::ADD16rm:
+  case X86::ADD16rr:
+  case X86::ADD16rr_DB:
+  case X86::ADD32i32:
+  case X86::ADD32ri:
+  case X86::ADD32ri8:
+  case X86::ADD32ri8_DB:
+  case X86::ADD32ri_DB:
+  case X86::ADD32rm:
+  case X86::ADD32rr:
+  case X86::ADD32rr_DB:
+  case X86::ADD64i32:
+  case X86::ADD64ri32:
+  case X86::ADD64ri32_DB:
+  case X86::ADD64ri8:
+  case X86::ADD64ri8_DB:
+  case X86::ADD64rm:
+  case X86::ADD64rr:
+  case X86::ADD64rr_DB:
+  case X86::ADD8i8:
+  case X86::ADD8mi:
+  case X86::ADD8mr:
+  case X86::ADD8ri:
+  case X86::ADD8rm:
+  case X86::ADD8rr:
+  case X86::SUB16i16:
+  case X86::SUB16ri:
+  case X86::SUB16ri8:
+  case X86::SUB16rm:
+  case X86::SUB16rr:
+  case X86::SUB32i32:
+  case X86::SUB32ri:
+  case X86::SUB32ri8:
+  case X86::SUB32rm:
+  case X86::SUB32rr:
+  case X86::SUB64i32:
+  case X86::SUB64ri32:
+  case X86::SUB64ri8:
+  case X86::SUB64rm:
+  case X86::SUB64rr:
+  case X86::SUB8i8:
+  case X86::SUB8ri:
+  case X86::SUB8rm:
+  case X86::SUB8rr:
+    return FuseKind == FuseCmp || FuseKind == FuseInc;
+  case X86::INC16r:
+  case X86::INC32r:
+  case X86::INC64r:
+  case X86::INC8r:
+  case X86::DEC16r:
+  case X86::DEC32r:
+  case X86::DEC64r:
+  case X86::DEC8r:
+    return FuseKind == FuseInc;
+  case X86::INSTRUCTION_LIST_END:
+    return true;
+  }
+}
+
+void X86MacroFusion::apply(ScheduleDAGInstrs *DAGInstrs) {
+  ScheduleDAGMI *DAG = static_cast<ScheduleDAGMI *>(DAGInstrs);
+  const X86Subtarget &ST = DAG->MF.getSubtarget<X86Subtarget>();
+
+  // For now, assume targets can only fuse with the branch.
+  SUnit &ExitSU = DAG->ExitSU;
+  MachineInstr *Branch = ExitSU.getInstr();
+  if (!shouldScheduleAdjacent(ST, nullptr, Branch))
+    return;
+
+  for (SDep &PredDep : ExitSU.Preds) {
+    if (PredDep.isWeak())
+      continue;
+    SUnit &SU = *PredDep.getSUnit();
+    MachineInstr &Pred = *SU.getInstr();
+    if (!shouldScheduleAdjacent(ST, &Pred, Branch))
+      continue;
+
+    // Create a single weak edge from SU to ExitSU. The only effect is to cause
+    // bottom-up scheduling to heavily prioritize the clustered SU. There is no
+    // need to copy predecessor edges from ExitSU to SU, since top-down
+    // scheduling cannot prioritize ExitSU anyway. To defer top-down scheduling
+    // of SU, we could create an artificial edge from the deepest root, but it
+    // hasn't been needed yet.
+    bool Success = DAG->addEdge(&ExitSU, SDep(&SU, SDep::Cluster));
+    (void)Success;
+    assert(Success && "No DAG nodes should be reachable from ExitSU");
+
+    // Adjust latency of data deps between the nodes.
+    for (SDep &PredDep : ExitSU.Preds)
+      if (PredDep.getSUnit() == &SU)
+        PredDep.setLatency(0);
+    for (SDep &SuccDep : SU.Succs)
+      if (SuccDep.getSUnit() == &ExitSU)
+        SuccDep.setLatency(0);
+
+    DEBUG(dbgs() << "Macro fuse ";
+          SU.print(dbgs(), DAG);
+          dbgs() << " - ExitSU" << '\n');
+
+    break;
+  }
+}
+
+std::unique_ptr<ScheduleDAGMutation> createX86MacroFusionDAGMutation() {
+  return EnableMacroFusion ? make_unique<X86MacroFusion>() : nullptr;
+}
+
+} // namespace llvm
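Two details are worth noting about the X86 version relative to the AArch64
one. First, it keeps the old behavior of fusing only with the branch in the
ExitSU rather than scanning every SUnit. Second, the old explicit null check
on the exit instruction is now subsumed by the sentinel convention: with
Branch == nullptr, SecondOpcode becomes X86::INSTRUCTION_LIST_END, the
second-opcode switch falls into its default and returns false, and apply()
bails out early, reproducing the deleted `if (!Branch) return;`.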
Index: llvm/lib/Target/X86/X86TargetMachine.cpp
===================================================================
--- llvm/lib/Target/X86/X86TargetMachine.cpp
+++ llvm/lib/Target/X86/X86TargetMachine.cpp
@@ -14,6 +14,7 @@
 #include "X86TargetMachine.h"
 #include "X86.h"
 #include "X86CallLowering.h"
+#include "X86MacroFusion.h"
 #include "X86TargetObjectFile.h"
 #include "X86TargetTransformInfo.h"
 #include "llvm/CodeGen/GlobalISel/GISelAccessor.h"
@@ -289,7 +290,7 @@
   ScheduleDAGInstrs *
   createMachineScheduler(MachineSchedContext *C) const override {
     ScheduleDAGMILive *DAG = createGenericSchedLive(C);
-    DAG->addMutation(createMacroFusionDAGMutation(DAG->TII));
+    DAG->addMutation(createX86MacroFusionDAGMutation());
     return DAG;
   }
Index: llvm/test/CodeGen/AArch64/misched-fusion.ll
===================================================================
--- llvm/test/CodeGen/AArch64/misched-fusion.ll
+++ llvm/test/CodeGen/AArch64/misched-fusion.ll
@@ -1,22 +1,14 @@
 ; RUN: llc -o - %s -mattr=+arith-cbz-fusion | FileCheck %s
 ; RUN: llc -o - %s -mcpu=cyclone | FileCheck %s
 
-target triple = "arm64-apple-ios"
+target triple = "aarch64-unknown"
 
 declare void @foobar(i32 %v0, i32 %v1)
 
 ; Make sure sub is scheduled in front of cbnz
 ; CHECK-LABEL: test_sub_cbz:
-; CHECK: add w[[ADDRES:[0-9]+]], w1, #7
 ; CHECK: sub w[[SUBRES:[0-9]+]], w0, #13
-; CHECK-NEXT: cbnz w[[SUBRES]], [[SKIPBLOCK:LBB[0-9_]+]]
-; CHECK: mov [[REGTY:[x,w]]]0, [[REGTY]][[ADDRES]]
-; CHECK: mov [[REGTY]]1, [[REGTY]][[SUBRES]]
-; CHECK: bl _foobar
-; CHECK: [[SKIPBLOCK]]:
-; CHECK: mov [[REGTY]]0, [[REGTY]][[SUBRES]]
-; CHECK: mov [[REGTY]]1, [[REGTY]][[ADDRES]]
-; CHECK: bl _foobar
+; CHECK-NEXT: cbnz w[[SUBRES]], {{.?LBB[0-9_]+}}
 define void @test_sub_cbz(i32 %a0, i32 %a1) {
 entry:
 ; except for the fusion opportunity the sub/add should be equal so the