Index: llvm/include/llvm/CodeGen/MachineScheduler.h =================================================================== --- llvm/include/llvm/CodeGen/MachineScheduler.h +++ llvm/include/llvm/CodeGen/MachineScheduler.h @@ -1033,9 +1033,6 @@ const TargetRegisterInfo *TRI); std::unique_ptr<ScheduleDAGMutation> -createMacroFusionDAGMutation(const TargetInstrInfo *TII); - -std::unique_ptr<ScheduleDAGMutation> createCopyConstrainDAGMutation(const TargetInstrInfo *TII, const TargetRegisterInfo *TRI); Index: llvm/include/llvm/Target/TargetInstrInfo.h =================================================================== --- llvm/include/llvm/Target/TargetInstrInfo.h +++ llvm/include/llvm/Target/TargetInstrInfo.h @@ -1070,15 +1070,6 @@ llvm_unreachable("target did not implement shouldClusterMemOps()"); } - /// Can this target fuse the given instructions if they are scheduled - /// adjacent. Note that you have to add: - /// DAG.addMutation(createMacroFusionDAGMutation()); - /// to TargetPassConfig::createMachineScheduler() to have an effect. - virtual bool shouldScheduleAdjacent(const MachineInstr &First, - const MachineInstr &Second) const { - llvm_unreachable("target did not implement shouldScheduleAdjacent()"); - } - /// Reverses the branch condition of the specified condition list, /// returning false on success and true if it cannot be reversed. 
virtual Index: llvm/lib/CodeGen/MachineScheduler.cpp =================================================================== --- llvm/lib/CodeGen/MachineScheduler.cpp +++ llvm/lib/CodeGen/MachineScheduler.cpp @@ -80,10 +80,6 @@ cl::desc("Enable memop clustering."), cl::init(true)); -// Experimental heuristics -static cl::opt<bool> EnableMacroFusion("misched-fusion", cl::Hidden, - cl::desc("Enable scheduling for macro fusion."), cl::init(true)); - static cl::opt<bool> VerifyScheduling("verify-misched", cl::Hidden, cl::desc("Verify machine instrs before and after machine scheduling")); @@ -1543,76 +1539,6 @@ } //===----------------------------------------------------------------------===// -// MacroFusion - DAG post-processing to encourage fusion of macro ops. -//===----------------------------------------------------------------------===// - -namespace { -/// \brief Post-process the DAG to create cluster edges between instructions -/// that may be fused by the processor into a single operation. -class MacroFusion : public ScheduleDAGMutation { - const TargetInstrInfo &TII; -public: - MacroFusion(const TargetInstrInfo &TII) - : TII(TII) {} - - void apply(ScheduleDAGInstrs *DAGInstrs) override; -}; -} // anonymous - -namespace llvm { - -std::unique_ptr<ScheduleDAGMutation> -createMacroFusionDAGMutation(const TargetInstrInfo *TII) { - return EnableMacroFusion ? make_unique<MacroFusion>(*TII) : nullptr; -} - -} // namespace llvm - -/// \brief Callback from DAG postProcessing to create cluster edges to encourage -/// fused operations. -void MacroFusion::apply(ScheduleDAGInstrs *DAGInstrs) { - ScheduleDAGMI *DAG = static_cast<ScheduleDAGMI*>(DAGInstrs); - - // For now, assume targets can only fuse with the branch. 
- SUnit &ExitSU = DAG->ExitSU; - MachineInstr *Branch = ExitSU.getInstr(); - if (!Branch) - return; - - for (SDep &PredDep : ExitSU.Preds) { - if (PredDep.isWeak()) - continue; - SUnit &SU = *PredDep.getSUnit(); - MachineInstr &Pred = *SU.getInstr(); - if (!TII.shouldScheduleAdjacent(Pred, *Branch)) - continue; - - // Create a single weak edge from SU to ExitSU. The only effect is to cause - // bottom-up scheduling to heavily prioritize the clustered SU. There is no - // need to copy predecessor edges from ExitSU to SU, since top-down - // scheduling cannot prioritize ExitSU anyway. To defer top-down scheduling - // of SU, we could create an artificial edge from the deepest root, but it - // hasn't been needed yet. - bool Success = DAG->addEdge(&ExitSU, SDep(&SU, SDep::Cluster)); - (void)Success; - assert(Success && "No DAG nodes should be reachable from ExitSU"); - - // Adjust latency of data deps between the nodes. - for (SDep &PredDep : ExitSU.Preds) { - if (PredDep.getSUnit() == &SU) - PredDep.setLatency(0); - } - for (SDep &SuccDep : SU.Succs) { - if (SuccDep.getSUnit() == &ExitSU) - SuccDep.setLatency(0); - } - - DEBUG(dbgs() << "Macro Fuse SU(" << SU.NodeNum << ")\n"); - break; - } -} - -//===----------------------------------------------------------------------===// // CopyConstrain - DAG post-processing to encourage copy elimination. 
//===----------------------------------------------------------------------===// Index: llvm/lib/Target/AArch64/AArch64InstrInfo.h =================================================================== --- llvm/lib/Target/AArch64/AArch64InstrInfo.h +++ llvm/lib/Target/AArch64/AArch64InstrInfo.h @@ -26,6 +26,7 @@ class AArch64Subtarget; class AArch64TargetMachine; +class ScheduleDAGInstrs; class AArch64InstrInfo final : public AArch64GenInstrInfo { const AArch64RegisterInfo RI; @@ -136,8 +137,11 @@ bool shouldClusterMemOps(MachineInstr &FirstLdSt, MachineInstr &SecondLdSt, unsigned NumLoads) const override; - bool shouldScheduleAdjacent(const MachineInstr &First, - const MachineInstr &Second) const override; + /// Attempt to fuse instructions in the given scheduling block. + /// Note that you have to add: + /// DAG.addMutation(createMacroFusionDAGMutation()); + /// to TargetPassConfig::createMachineScheduler() to have an effect. + void scheduleAdjacent(ScheduleDAGInstrs *DAGInstrs) const; MachineInstr *emitFrameIndexDebugValue(MachineFunction &MF, int FrameIx, uint64_t Offset, const MDNode *Var, Index: llvm/lib/Target/AArch64/AArch64InstrInfo.cpp =================================================================== --- llvm/lib/Target/AArch64/AArch64InstrInfo.cpp +++ llvm/lib/Target/AArch64/AArch64InstrInfo.cpp @@ -12,6 +12,7 @@ //===----------------------------------------------------------------------===// #include "AArch64InstrInfo.h" +#include "AArch64MacroFusion.h" #include "AArch64Subtarget.h" #include "MCTargetDesc/AArch64AddressingModes.h" #include "Utils/AArch64BaseInfo.h" @@ -1903,13 +1904,23 @@ return Offset1 + 1 == Offset2; } -bool AArch64InstrInfo::shouldScheduleAdjacent( - const MachineInstr &First, const MachineInstr &Second) const { - if (Subtarget.hasArithmeticBccFusion()) { +/// \brief Verify that the instruction pair, \param First and \param Second, +/// should be scheduled back to back. 
Given an anchor instruction, if the other +/// instruction is unspecified, then verify that the anchor instruction may be +/// part of a pair at all. +static bool shouldScheduleAdjacent(const AArch64InstrInfo &TII, + const AArch64Subtarget &ST, + const MachineInstr *First, + const MachineInstr *Second) { + unsigned FirstOpcode = First ? + First->getOpcode() : AArch64::INSTRUCTION_LIST_END; + unsigned SecondOpcode = Second ? + Second->getOpcode() : AArch64::INSTRUCTION_LIST_END; + + if (ST.hasArithmeticBccFusion()) // Fuse CMN, CMP, TST followed by Bcc. - unsigned SecondOpcode = Second.getOpcode(); - if (SecondOpcode == AArch64::Bcc) { - switch (First.getOpcode()) { + if (SecondOpcode == AArch64::Bcc) + switch (FirstOpcode) { default: return false; case AArch64::ADDSWri: @@ -1936,16 +1947,16 @@ case AArch64::BICSWrs: case AArch64::BICSXrs: // Shift value can be 0 making these behave like the "rr" variant... - return !hasShiftedReg(Second); + return !TII.hasShiftedReg(*First); + case AArch64::INSTRUCTION_LIST_END: + return true; } - } - } - if (Subtarget.hasArithmeticCbzFusion()) { + + if (ST.hasArithmeticCbzFusion()) // Fuse ALU operations followed by CBZ/CBNZ. - unsigned SecondOpcode = Second.getOpcode(); if (SecondOpcode == AArch64::CBNZW || SecondOpcode == AArch64::CBNZX || - SecondOpcode == AArch64::CBZW || SecondOpcode == AArch64::CBZX) { - switch (First.getOpcode()) { + SecondOpcode == AArch64::CBZW || SecondOpcode == AArch64::CBZX) + switch (FirstOpcode) { default: return false; case AArch64::ADDWri: @@ -1978,13 +1989,80 @@ case AArch64::BICWrs: case AArch64::BICXrs: // Shift value can be 0 making these behave like the "rr" variant... - return !hasShiftedReg(Second); + return !TII.hasShiftedReg(*First); + case AArch64::INSTRUCTION_LIST_END: + return true; } - } + + return false; +} + +/// \brief Implement the fusion of instruction pairs in the scheduling +/// \param DAG, anchored at the instruction in \param ASU. 
\param Preds +/// indicates if its dependencies in \param APreds are predecessors instead of +/// successors. +static bool scheduleAdjacentImpl(const AArch64InstrInfo &TII, + const AArch64Subtarget &ST, + ScheduleDAGMI *DAG, SUnit *ASU, + SmallVectorImpl<SDep> &APreds, bool Preds) { + const MachineInstr *AMI = ASU->getInstr(); + if (!AMI || AMI->isPseudo() || AMI->isTransient() || + (Preds && !shouldScheduleAdjacent(TII, ST, nullptr, AMI)) || + (!Preds && !shouldScheduleAdjacent(TII, ST, AMI, nullptr))) + return false; + + for (SDep &BDep : APreds) { + if (BDep.isWeak()) + continue; + + SUnit *BSU = BDep.getSUnit(); + const MachineInstr *BMI = BSU->getInstr(); + if (!BMI || BMI->isPseudo() || BMI->isTransient() || + (Preds && !shouldScheduleAdjacent(TII, ST, BMI, AMI)) || + (!Preds && !shouldScheduleAdjacent(TII, ST, AMI, BMI))) + continue; + + // Create a single weak edge between the adjacent instrs. The only + // effect is to cause bottom-up scheduling to heavily prioritize the + // clustered instrs. + if (Preds) + DAG->addEdge(ASU, SDep(BSU, SDep::Cluster)); + else + DAG->addEdge(BSU, SDep(ASU, SDep::Cluster)); + + // Adjust the latency between the 1st instr and its predecessors/successors. + for (SDep &Dep : APreds) + if (Dep.getSUnit() == BSU) + Dep.setLatency(0); + + // Adjust the latency between the 2nd instr and its successors/predecessors. + auto &BSuccs = Preds ? BSU->Succs : BSU->Preds; + for (SDep &Dep : BSuccs) + if (Dep.getSUnit() == ASU) + Dep.setLatency(0); + + return true; } + return false; } +/// \brief Callback from DAG postProcessing to create cluster edges to encourage +/// fused operations. +void +AArch64InstrInfo::scheduleAdjacent(ScheduleDAGInstrs *DAGInstrs) const { + ScheduleDAGMI *DAG = static_cast<ScheduleDAGMI*>(DAGInstrs); + + // For each of the SUnits in the scheduling block, try to fuse the instruction + // in it with one in its successors. 
+ for (SUnit &ASU : DAG->SUnits) + scheduleAdjacentImpl(*this, Subtarget, DAG, &ASU, ASU.Succs, false); + + // Try to fuse the instruction in the ExitSU with one in its predecessors. + scheduleAdjacentImpl(*this, Subtarget, + DAG, &DAG->ExitSU, DAG->ExitSU.Preds, true); +} + MachineInstr *AArch64InstrInfo::emitFrameIndexDebugValue( MachineFunction &MF, int FrameIx, uint64_t Offset, const MDNode *Var, const MDNode *Expr, const DebugLoc &DL) const { Index: llvm/lib/Target/AArch64/AArch64MacroFusion.h =================================================================== --- /dev/null +++ llvm/lib/Target/AArch64/AArch64MacroFusion.h @@ -0,0 +1,40 @@ +//===- AArch64MacroFusion.h - AArch64 Macro Fusion ------------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file contains the AArch64 definition of the DAG scheduling mutation +// to pair instructions back to back. +// +//===----------------------------------------------------------------------===// + +#include "AArch64InstrInfo.h" +#include "llvm/CodeGen/MachineScheduler.h" + +//===----------------------------------------------------------------------===// +// AArch64MacroFusion - DAG post-processing to encourage fusion of macro ops. +//===----------------------------------------------------------------------===// + +namespace llvm { + +/// \brief Post-process the DAG to create cluster edges between instructions +/// that may be fused by the processor into a single operation. 
+class AArch64MacroFusion : public ScheduleDAGMutation { + const AArch64InstrInfo &TII; +public: + AArch64MacroFusion(const AArch64InstrInfo &TII) + : TII(TII) {} + + void apply(ScheduleDAGInstrs *DAGInstrs) override { + TII.scheduleAdjacent(DAGInstrs); + } +}; + +std::unique_ptr<ScheduleDAGMutation> +createMacroFusionDAGMutation(const AArch64InstrInfo *TII); + +} // llvm Index: llvm/lib/Target/AArch64/AArch64MacroFusion.cpp =================================================================== --- /dev/null +++ llvm/lib/Target/AArch64/AArch64MacroFusion.cpp @@ -0,0 +1,31 @@ +//===- AArch64MacroFusion.cpp - AArch64 Macro Fusion ----------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file contains the AArch64 implementation of the DAG scheduling mutation +// to pair instructions back to back. +// +//===----------------------------------------------------------------------===// + +#include "AArch64MacroFusion.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Target/TargetInstrInfo.h" + +using namespace llvm; + +static cl::opt<bool> EnableMacroFusion("aarch64-misched-fusion", cl::Hidden, + cl::desc("Enable scheduling for macro fusion."), cl::init(true)); + +namespace llvm { + +std::unique_ptr<ScheduleDAGMutation> +createMacroFusionDAGMutation (const AArch64InstrInfo *TII) { + return EnableMacroFusion ? 
make_unique<AArch64MacroFusion>(*TII) : nullptr; +} + +} // llvm Index: llvm/lib/Target/AArch64/AArch64TargetMachine.cpp =================================================================== --- llvm/lib/Target/AArch64/AArch64TargetMachine.cpp +++ llvm/lib/Target/AArch64/AArch64TargetMachine.cpp @@ -14,6 +14,7 @@ #include "AArch64CallLowering.h" #include "AArch64InstructionSelector.h" #include "AArch64LegalizerInfo.h" +#include "AArch64MacroFusion.h" #include "AArch64RegisterBankInfo.h" #include "AArch64Subtarget.h" #include "AArch64TargetMachine.h" @@ -323,7 +324,9 @@ ScheduleDAGMILive *DAG = createGenericSchedLive(C); DAG->addMutation(createLoadClusterDAGMutation(DAG->TII, DAG->TRI)); DAG->addMutation(createStoreClusterDAGMutation(DAG->TII, DAG->TRI)); - DAG->addMutation(createMacroFusionDAGMutation(DAG->TII)); + const AArch64InstrInfo *AII = + static_cast<const AArch64InstrInfo*>(DAG->TII); + DAG->addMutation(createMacroFusionDAGMutation(AII)); return DAG; } Index: llvm/lib/Target/AArch64/CMakeLists.txt =================================================================== --- llvm/lib/Target/AArch64/CMakeLists.txt +++ llvm/lib/Target/AArch64/CMakeLists.txt @@ -55,6 +55,7 @@ AArch64ISelLowering.cpp AArch64InstrInfo.cpp AArch64LoadStoreOptimizer.cpp + AArch64MacroFusion.cpp AArch64MCInstLower.cpp AArch64PromoteConstant.cpp AArch64PBQPRegAlloc.cpp Index: llvm/lib/Target/X86/CMakeLists.txt =================================================================== --- llvm/lib/Target/X86/CMakeLists.txt +++ llvm/lib/Target/X86/CMakeLists.txt @@ -43,6 +43,7 @@ X86EvexToVex.cpp X86MCInstLower.cpp X86MachineFunctionInfo.cpp + X86MacroFusion.cpp X86OptimizeLEAs.cpp X86PadShortFunction.cpp X86RegisterInfo.cpp Index: llvm/lib/Target/X86/X86InstrInfo.h =================================================================== --- llvm/lib/Target/X86/X86InstrInfo.h +++ llvm/lib/Target/X86/X86InstrInfo.h @@ -25,6 +25,7 @@ namespace llvm { class MachineInstrBuilder; + class ScheduleDAGInstrs; class X86RegisterInfo; class X86Subtarget; @@ 
-443,8 +444,11 @@ int64_t Offset1, int64_t Offset2, unsigned NumLoads) const override; - bool shouldScheduleAdjacent(const MachineInstr &First, - const MachineInstr &Second) const override; + /// Attempt to fuse instructions in the given scheduling block. + /// Note that you have to add: + /// DAG.addMutation(createMacroFusionDAGMutation()); + /// to TargetPassConfig::createMachineScheduler() to have an effect. + void scheduleAdjacent(ScheduleDAGInstrs *DAGInstrs) const; void getNoopForMachoTarget(MCInst &NopInst) const override; Index: llvm/lib/Target/X86/X86InstrInfo.cpp =================================================================== --- llvm/lib/Target/X86/X86InstrInfo.cpp +++ llvm/lib/Target/X86/X86InstrInfo.cpp @@ -15,6 +15,7 @@ #include "X86.h" #include "X86InstrBuilder.h" #include "X86MachineFunctionInfo.h" +#include "X86MacroFusion.h" #include "X86Subtarget.h" #include "X86TargetMachine.h" #include "llvm/ADT/STLExtras.h" @@ -8294,8 +8295,12 @@ return true; } -bool X86InstrInfo::shouldScheduleAdjacent(const MachineInstr &First, - const MachineInstr &Second) const { +/// \brief Verify that the instruction pair, \param First and \param Second, +/// should be scheduled back to back. If either instruction is unspecified, +/// then verify that the other instruction may be part of a pair at all. +static bool shouldScheduleAdjacent(const X86Subtarget &Subtarget, + const MachineInstr *First, + const MachineInstr *Second) { // Check if this processor supports macro-fusion. Since this is a minor // heuristic, we haven't specifically reserved a feature. hasAVX is a decent // proxy for SandyBridge+. @@ -8308,7 +8313,12 @@ FuseInc } FuseKind; - switch (Second.getOpcode()) { + unsigned FirstOpcode = First ? + First->getOpcode() : X86::INSTRUCTION_LIST_END; + unsigned SecondOpcode = Second ? 
+ Second->getOpcode() : X86::INSTRUCTION_LIST_END; + + switch (SecondOpcode) { default: return false; case X86::JE_1: @@ -8334,7 +8344,8 @@ FuseKind = FuseTest; break; } - switch (First.getOpcode()) { + + switch (FirstOpcode) { default: return false; case X86::TEST8rr: @@ -8450,6 +8461,49 @@ case X86::DEC64r: case X86::DEC8r: return FuseKind == FuseInc; + case X86::INSTRUCTION_LIST_END: + return true; + } +} + +void X86InstrInfo::scheduleAdjacent(ScheduleDAGInstrs *DAGInstrs) const { + ScheduleDAGMI *DAG = static_cast<ScheduleDAGMI*>(DAGInstrs); + + // For now, assume targets can only fuse with the branch. + SUnit &ExitSU = DAG->ExitSU; + MachineInstr *Branch = ExitSU.getInstr(); + if (!shouldScheduleAdjacent(Subtarget, nullptr, Branch)) + return; + + for (SDep &PredDep : ExitSU.Preds) { + if (PredDep.isWeak()) + continue; + SUnit &SU = *PredDep.getSUnit(); + MachineInstr &Pred = *SU.getInstr(); + if (!shouldScheduleAdjacent(Subtarget, &Pred, Branch)) + continue; + + // Create a single weak edge from SU to ExitSU. The only effect is to cause + // bottom-up scheduling to heavily prioritize the clustered SU. There is no + // need to copy predecessor edges from ExitSU to SU, since top-down + // scheduling cannot prioritize ExitSU anyway. To defer top-down scheduling + // of SU, we could create an artificial edge from the deepest root, but it + // hasn't been needed yet. + bool Success = DAG->addEdge(&ExitSU, SDep(&SU, SDep::Cluster)); + (void)Success; + assert(Success && "No DAG nodes should be reachable from ExitSU"); + + // Adjust latency of data deps between the nodes. 
+ for (SDep &PredDep : ExitSU.Preds) { + if (PredDep.getSUnit() == &SU) + PredDep.setLatency(0); + } + for (SDep &SuccDep : SU.Succs) { + if (SuccDep.getSUnit() == &ExitSU) + SuccDep.setLatency(0); + } + + break; } } Index: llvm/lib/Target/X86/X86MacroFusion.h =================================================================== --- /dev/null +++ llvm/lib/Target/X86/X86MacroFusion.h @@ -0,0 +1,40 @@ +//===- X86MacroFusion.h - X86 Macro Fusion --------------------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file contains the X86 definition of the DAG scheduling mutation to pair +// instructions back to back. +// +//===----------------------------------------------------------------------===// + +#include "X86InstrInfo.h" +#include "llvm/CodeGen/MachineScheduler.h" + +//===----------------------------------------------------------------------===// +// X86MacroFusion - DAG post-processing to encourage fusion of macro ops. +//===----------------------------------------------------------------------===// + +namespace llvm { + +/// \brief Post-process the DAG to create cluster edges between instructions +/// that may be fused by the processor into a single operation. 
+class X86MacroFusion : public ScheduleDAGMutation { + const X86InstrInfo &TII; +public: + X86MacroFusion(const X86InstrInfo &TII) + : TII(TII) {} + + void apply(ScheduleDAGInstrs *DAGInstrs) override { + TII.scheduleAdjacent(DAGInstrs); + } +}; + +std::unique_ptr<ScheduleDAGMutation> +createMacroFusionDAGMutation(const X86InstrInfo *TII); + +} // llvm Index: llvm/lib/Target/X86/X86MacroFusion.cpp =================================================================== --- /dev/null +++ llvm/lib/Target/X86/X86MacroFusion.cpp @@ -0,0 +1,31 @@ +//===- X86MacroFusion.cpp - X86 Macro Fusion ------------------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file contains the X86 implementation of the DAG scheduling mutation to +// pair instructions back to back. +// +//===----------------------------------------------------------------------===// + +#include "X86MacroFusion.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Target/TargetInstrInfo.h" + +using namespace llvm; + +static cl::opt<bool> EnableMacroFusion("x86-misched-fusion", cl::Hidden, + cl::desc("Enable scheduling for macro fusion."), cl::init(true)); + +namespace llvm { + +std::unique_ptr<ScheduleDAGMutation> +createMacroFusionDAGMutation (const X86InstrInfo *TII) { + return EnableMacroFusion ? 
make_unique<X86MacroFusion>(*TII) : nullptr; +} + +} // llvm Index: llvm/lib/Target/X86/X86TargetMachine.cpp =================================================================== --- llvm/lib/Target/X86/X86TargetMachine.cpp +++ llvm/lib/Target/X86/X86TargetMachine.cpp @@ -14,6 +14,7 @@ #include "X86TargetMachine.h" #include "X86.h" #include "X86CallLowering.h" +#include "X86MacroFusion.h" #include "X86TargetObjectFile.h" #include "X86TargetTransformInfo.h" #include "llvm/CodeGen/GlobalISel/GISelAccessor.h" @@ -289,7 +290,8 @@ ScheduleDAGInstrs * createMachineScheduler(MachineSchedContext *C) const override { ScheduleDAGMILive *DAG = createGenericSchedLive(C); - DAG->addMutation(createMacroFusionDAGMutation(DAG->TII)); + const X86InstrInfo *AII = static_cast<const X86InstrInfo*>(DAG->TII); + DAG->addMutation(createMacroFusionDAGMutation(AII)); return DAG; } Index: llvm/test/CodeGen/AArch64/misched-fusion.ll =================================================================== --- llvm/test/CodeGen/AArch64/misched-fusion.ll +++ llvm/test/CodeGen/AArch64/misched-fusion.ll @@ -1,22 +1,14 @@ ; RUN: llc -o - %s -mattr=+arith-cbz-fusion | FileCheck %s ; RUN: llc -o - %s -mcpu=cyclone | FileCheck %s -target triple = "arm64-apple-ios" +target triple = "aarch64-unknown" declare void @foobar(i32 %v0, i32 %v1) ; Make sure sub is scheduled in front of cbnz ; CHECK-LABEL: test_sub_cbz: -; CHECK: add w[[ADDRES:[0-9]+]], w1, #7 ; CHECK: sub w[[SUBRES:[0-9]+]], w0, #13 -; CHECK-NEXT: cbnz w[[SUBRES]], [[SKIPBLOCK:LBB[0-9_]+]] -; CHECK: mov [[REGTY:[x,w]]]0, [[REGTY]][[ADDRES]] -; CHECK: mov [[REGTY]]1, [[REGTY]][[SUBRES]] -; CHECK: bl _foobar -; CHECK: [[SKIPBLOCK]]: -; CHECK: mov [[REGTY]]0, [[REGTY]][[SUBRES]] -; CHECK: mov [[REGTY]]1, [[REGTY]][[ADDRES]] -; CHECK: bl _foobar +; CHECK-NEXT: cbnz w[[SUBRES]], {{.?LBB[0-9_]+}} define void @test_sub_cbz(i32 %a0, i32 %a1) { entry: ; except for the fusion opportunity the sub/add should be equal so the