Index: llvm/include/llvm/CodeGen/MachineScheduler.h
===================================================================
--- llvm/include/llvm/CodeGen/MachineScheduler.h
+++ llvm/include/llvm/CodeGen/MachineScheduler.h
@@ -1033,9 +1033,6 @@
                               const TargetRegisterInfo *TRI);
 
 std::unique_ptr<ScheduleDAGMutation>
-createMacroFusionDAGMutation(const TargetInstrInfo *TII);
-
-std::unique_ptr<ScheduleDAGMutation>
 createCopyConstrainDAGMutation(const TargetInstrInfo *TII,
                                const TargetRegisterInfo *TRI);
 
Index: llvm/include/llvm/Target/TargetInstrInfo.h
===================================================================
--- llvm/include/llvm/Target/TargetInstrInfo.h
+++ llvm/include/llvm/Target/TargetInstrInfo.h
@@ -1070,15 +1070,6 @@
     llvm_unreachable("target did not implement shouldClusterMemOps()");
   }
 
-  /// Can this target fuse the given instructions if they are scheduled
-  /// adjacent. Note that you have to add:
-  ///   DAG.addMutation(createMacroFusionDAGMutation());
-  /// to TargetPassConfig::createMachineScheduler() to have an effect.
-  virtual bool shouldScheduleAdjacent(const MachineInstr &First,
-                                      const MachineInstr &Second) const {
-    llvm_unreachable("target did not implement shouldScheduleAdjacent()");
-  }
-
   /// Reverses the branch condition of the specified condition list,
   /// returning false on success and true if it cannot be reversed.
   virtual
Index: llvm/lib/CodeGen/MachineScheduler.cpp
===================================================================
--- llvm/lib/CodeGen/MachineScheduler.cpp
+++ llvm/lib/CodeGen/MachineScheduler.cpp
@@ -80,10 +80,6 @@
                                         cl::desc("Enable memop clustering."),
                                         cl::init(true));
 
-// Experimental heuristics
-static cl::opt<bool> EnableMacroFusion("misched-fusion", cl::Hidden,
-  cl::desc("Enable scheduling for macro fusion."), cl::init(true));
-
 static cl::opt<bool> VerifyScheduling("verify-misched", cl::Hidden,
   cl::desc("Verify machine instrs before and after machine scheduling"));
 
@@ -1543,76 +1539,6 @@
 }
 
 //===----------------------------------------------------------------------===//
-// MacroFusion - DAG post-processing to encourage fusion of macro ops.
-//===----------------------------------------------------------------------===//
-
-namespace {
-/// \brief Post-process the DAG to create cluster edges between instructions
-/// that may be fused by the processor into a single operation.
-class MacroFusion : public ScheduleDAGMutation {
-  const TargetInstrInfo &TII;
-public:
-  MacroFusion(const TargetInstrInfo &TII)
-    : TII(TII) {}
-
-  void apply(ScheduleDAGInstrs *DAGInstrs) override;
-};
-} // anonymous
-
-namespace llvm {
-
-std::unique_ptr<ScheduleDAGMutation>
-createMacroFusionDAGMutation(const TargetInstrInfo *TII) {
-  return EnableMacroFusion ? make_unique<MacroFusion>(*TII) : nullptr;
-}
-
-} // namespace llvm
-
-/// \brief Callback from DAG postProcessing to create cluster edges to encourage
-/// fused operations.
-void MacroFusion::apply(ScheduleDAGInstrs *DAGInstrs) {
-  ScheduleDAGMI *DAG = static_cast<ScheduleDAGMI*>(DAGInstrs);
-
-  // For now, assume targets can only fuse with the branch.
-  SUnit &ExitSU = DAG->ExitSU;
-  MachineInstr *Branch = ExitSU.getInstr();
-  if (!Branch)
-    return;
-
-  for (SDep &PredDep : ExitSU.Preds) {
-    if (PredDep.isWeak())
-      continue;
-    SUnit &SU = *PredDep.getSUnit();
-    MachineInstr &Pred = *SU.getInstr();
-    if (!TII.shouldScheduleAdjacent(Pred, *Branch))
-      continue;
-
-    // Create a single weak edge from SU to ExitSU. The only effect is to cause
-    // bottom-up scheduling to heavily prioritize the clustered SU.  There is no
-    // need to copy predecessor edges from ExitSU to SU, since top-down
-    // scheduling cannot prioritize ExitSU anyway. To defer top-down scheduling
-    // of SU, we could create an artificial edge from the deepest root, but it
-    // hasn't been needed yet.
-    bool Success = DAG->addEdge(&ExitSU, SDep(&SU, SDep::Cluster));
-    (void)Success;
-    assert(Success && "No DAG nodes should be reachable from ExitSU");
-
-    // Adjust latency of data deps between the nodes.
-    for (SDep &PredDep : ExitSU.Preds) {
-      if (PredDep.getSUnit() == &SU)
-        PredDep.setLatency(0);
-    }
-    for (SDep &SuccDep : SU.Succs) {
-      if (SuccDep.getSUnit() == &ExitSU)
-        SuccDep.setLatency(0);
-    }
-
-    DEBUG(dbgs() << "Macro Fuse SU(" << SU.NodeNum << ")\n");
-    break;
-  }
-}
-
-//===----------------------------------------------------------------------===//
 // CopyConstrain - DAG post-processing to encourage copy elimination.
 //===----------------------------------------------------------------------===//
 
Index: llvm/lib/Target/AArch64/AArch64InstrInfo.h
===================================================================
--- llvm/lib/Target/AArch64/AArch64InstrInfo.h
+++ llvm/lib/Target/AArch64/AArch64InstrInfo.h
@@ -26,6 +26,7 @@
 
 class AArch64Subtarget;
 class AArch64TargetMachine;
+class ScheduleDAGInstrs;
 
 class AArch64InstrInfo final : public AArch64GenInstrInfo {
   const AArch64RegisterInfo RI;
@@ -136,8 +137,11 @@
   bool shouldClusterMemOps(MachineInstr &FirstLdSt, MachineInstr &SecondLdSt,
                            unsigned NumLoads) const override;
 
-  bool shouldScheduleAdjacent(const MachineInstr &First,
-                              const MachineInstr &Second) const override;
+  /// Attempt to fuse instructions in the given scheduling block.
+  /// Note that you have to add:
+  ///   DAG.addMutation(createMacroFusionDAGMutation());
+  /// to TargetPassConfig::createMachineScheduler() to have an effect.
+  void scheduleAdjacent(ScheduleDAGInstrs *DAGInstrs) const;
 
   MachineInstr *emitFrameIndexDebugValue(MachineFunction &MF, int FrameIx,
                                          uint64_t Offset, const MDNode *Var,
Index: llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
===================================================================
--- llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
+++ llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
@@ -12,6 +12,7 @@
 //===----------------------------------------------------------------------===//
 
 #include "AArch64InstrInfo.h"
+#include "AArch64MacroFusion.h"
 #include "AArch64Subtarget.h"
 #include "MCTargetDesc/AArch64AddressingModes.h"
 #include "Utils/AArch64BaseInfo.h"
@@ -1903,13 +1904,23 @@
   return Offset1 + 1 == Offset2;
 }
 
-bool AArch64InstrInfo::shouldScheduleAdjacent(
-    const MachineInstr &First, const MachineInstr &Second) const {
-  if (Subtarget.hasArithmeticBccFusion()) {
+/// \brief Verify that the instruction pair, \param First and \param Second,
+/// should be scheduled back to back.  Given an anchor instruction, if the other
+/// instruction is unspecified, then verify that the anchor instruction may be
+/// part of a pair at all.
+static bool shouldScheduleAdjacent(const AArch64InstrInfo &TII,
+                                   const AArch64Subtarget &ST,
+                                   const MachineInstr *First,
+                                   const MachineInstr *Second) {
+  unsigned FirstOpcode = First ?
+                         First->getOpcode() : AArch64::INSTRUCTION_LIST_END;
+  unsigned SecondOpcode = Second ?
+                          Second->getOpcode() : AArch64::INSTRUCTION_LIST_END;
+
+  if (ST.hasArithmeticBccFusion())
     // Fuse CMN, CMP, TST followed by Bcc.
-    unsigned SecondOpcode = Second.getOpcode();
-    if (SecondOpcode == AArch64::Bcc) {
-      switch (First.getOpcode()) {
+    if (SecondOpcode == AArch64::Bcc)
+      switch (FirstOpcode) {
       default:
         return false;
       case AArch64::ADDSWri:
@@ -1936,16 +1947,16 @@
       case AArch64::BICSWrs:
       case AArch64::BICSXrs:
         // Shift value can be 0 making these behave like the "rr" variant...
-        return !hasShiftedReg(Second);
+        return !TII.hasShiftedReg(*First);
+      case AArch64::INSTRUCTION_LIST_END:
+        return true;
       }
-    }
-  }
-  if (Subtarget.hasArithmeticCbzFusion()) {
+
+  if (ST.hasArithmeticCbzFusion())
     // Fuse ALU operations followed by CBZ/CBNZ.
-    unsigned SecondOpcode = Second.getOpcode();
     if (SecondOpcode == AArch64::CBNZW || SecondOpcode == AArch64::CBNZX ||
-        SecondOpcode == AArch64::CBZW || SecondOpcode == AArch64::CBZX) {
-      switch (First.getOpcode()) {
+        SecondOpcode == AArch64::CBZW || SecondOpcode == AArch64::CBZX)
+      switch (FirstOpcode) {
       default:
         return false;
       case AArch64::ADDWri:
@@ -1978,13 +1989,80 @@
       case AArch64::BICWrs:
       case AArch64::BICXrs:
         // Shift value can be 0 making these behave like the "rr" variant...
-        return !hasShiftedReg(Second);
+        return !TII.hasShiftedReg(*First);
+      case AArch64::INSTRUCTION_LIST_END:
+        return true;
       }
-    }
+
+  return false;
+}
+
+/// \brief Implement the fusion of instruction pairs in the scheduling
+/// \param DAG, anchored at the instruction in \param ASU. \param Preds
+/// indicates if its dependencies in \param APreds are predecessors instead of
+/// successors.
+static bool scheduleAdjacentImpl(const AArch64InstrInfo &TII,
+                                 const AArch64Subtarget &ST,
+                                 ScheduleDAGMI *DAG, SUnit *ASU,
+                                 SmallVectorImpl<SDep> &APreds, bool Preds) {
+  const MachineInstr *AMI = ASU->getInstr();
+  if (!AMI || AMI->isPseudo() || AMI->isTransient() ||
+      (Preds && !shouldScheduleAdjacent(TII, ST, nullptr, AMI)) ||
+      (!Preds && !shouldScheduleAdjacent(TII, ST, AMI, nullptr)))
+    return false;
+
+  for (SDep &BDep : APreds) {
+    if (BDep.isWeak())
+      continue;
+
+    SUnit *BSU = BDep.getSUnit();
+    const MachineInstr *BMI = BSU->getInstr();
+    if (!BMI || BMI->isPseudo() || BMI->isTransient() ||
+        (Preds && !shouldScheduleAdjacent(TII, ST, BMI, AMI)) ||
+        (!Preds && !shouldScheduleAdjacent(TII, ST, AMI, BMI)))
+      continue;
+
+    // Create a single weak edge between the adjacent instrs. The only
+    // effect is to cause bottom-up scheduling to heavily prioritize the
+    // clustered instrs.
+    if (Preds)
+      DAG->addEdge(ASU, SDep(BSU, SDep::Cluster));
+    else
+      DAG->addEdge(BSU, SDep(ASU, SDep::Cluster));
+
+    // Adjust the latency between the 1st instr and its predecessors/successors.
+    for (SDep &Dep : APreds)
+      if (Dep.getSUnit() == BSU)
+        Dep.setLatency(0);
+
+    // Adjust the latency between the 2nd instr and its successors/predecessors.
+    auto &BSuccs = Preds ? BSU->Succs : BSU->Preds;
+    for (SDep &Dep : BSuccs)
+      if (Dep.getSUnit() == ASU)
+        Dep.setLatency(0);
+
+    return true;
   }
+
   return false;
 }
 
+/// \brief Callback from DAG postProcessing to create cluster edges to encourage
+/// fused operations.
+void
+AArch64InstrInfo::scheduleAdjacent(ScheduleDAGInstrs *DAGInstrs) const {
+  ScheduleDAGMI *DAG = static_cast<ScheduleDAGMI*>(DAGInstrs);
+
+  // For each of the SUnits in the scheduling block, try to fuse the instruction
+  // in it with one in its successors.
+  for (SUnit &ASU : DAG->SUnits)
+    scheduleAdjacentImpl(*this, Subtarget, DAG, &ASU, ASU.Succs, false);
+
+  // Try to fuse the instruction in the ExitSU with one in its predecessors.
+  scheduleAdjacentImpl(*this, Subtarget,
+                       DAG, &DAG->ExitSU, DAG->ExitSU.Preds, true);
+}
+
 MachineInstr *AArch64InstrInfo::emitFrameIndexDebugValue(
     MachineFunction &MF, int FrameIx, uint64_t Offset, const MDNode *Var,
     const MDNode *Expr, const DebugLoc &DL) const {
Index: llvm/lib/Target/AArch64/AArch64MacroFusion.h
===================================================================
--- /dev/null
+++ llvm/lib/Target/AArch64/AArch64MacroFusion.h
@@ -0,0 +1,40 @@
+//===- AArch64MacroFusion.h - AArch64 Macro Fusion ------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the AArch64 definition of the DAG scheduling mutation
+// to pair instructions back to back.
+//
+//===----------------------------------------------------------------------===//
+
+#include "AArch64InstrInfo.h"
+#include "llvm/CodeGen/MachineScheduler.h"
+
+//===----------------------------------------------------------------------===//
+// AArch64MacroFusion - DAG post-processing to encourage fusion of macro ops.
+//===----------------------------------------------------------------------===//
+
+namespace llvm {
+
+/// \brief Post-process the DAG to create cluster edges between instructions
+/// that may be fused by the processor into a single operation.
+class AArch64MacroFusion : public ScheduleDAGMutation {
+  const AArch64InstrInfo &TII;
+public:
+  AArch64MacroFusion(const AArch64InstrInfo &TII)
+    : TII(TII) {}
+
+  void apply(ScheduleDAGInstrs *DAGInstrs) override {
+    TII.scheduleAdjacent(DAGInstrs);
+  }
+};
+
+std::unique_ptr<ScheduleDAGMutation>
+createMacroFusionDAGMutation(const AArch64InstrInfo *TII);
+
+} // llvm
Index: llvm/lib/Target/AArch64/AArch64MacroFusion.cpp
===================================================================
--- /dev/null
+++ llvm/lib/Target/AArch64/AArch64MacroFusion.cpp
@@ -0,0 +1,31 @@
+//===- AArch64MacroFusion.cpp - AArch64 Macro Fusion ----------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the AArch64 implementation of the DAG scheduling mutation
+// to pair instructions back to back.
+//
+//===----------------------------------------------------------------------===//
+
+#include "AArch64MacroFusion.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Target/TargetInstrInfo.h"
+
+using namespace llvm;
+
+static cl::opt<bool> EnableMacroFusion("aarch64-misched-fusion", cl::Hidden,
+  cl::desc("Enable scheduling for macro fusion."), cl::init(true));
+
+namespace llvm {
+
+std::unique_ptr<ScheduleDAGMutation>
+createMacroFusionDAGMutation (const AArch64InstrInfo *TII) {
+  return EnableMacroFusion ? make_unique<AArch64MacroFusion>(*TII) : nullptr;
+}
+
+} // llvm
Index: llvm/lib/Target/AArch64/AArch64TargetMachine.cpp
===================================================================
--- llvm/lib/Target/AArch64/AArch64TargetMachine.cpp
+++ llvm/lib/Target/AArch64/AArch64TargetMachine.cpp
@@ -14,6 +14,7 @@
 #include "AArch64CallLowering.h"
 #include "AArch64InstructionSelector.h"
 #include "AArch64LegalizerInfo.h"
+#include "AArch64MacroFusion.h"
 #include "AArch64RegisterBankInfo.h"
 #include "AArch64Subtarget.h"
 #include "AArch64TargetMachine.h"
@@ -323,7 +324,9 @@
     ScheduleDAGMILive *DAG = createGenericSchedLive(C);
     DAG->addMutation(createLoadClusterDAGMutation(DAG->TII, DAG->TRI));
     DAG->addMutation(createStoreClusterDAGMutation(DAG->TII, DAG->TRI));
-    DAG->addMutation(createMacroFusionDAGMutation(DAG->TII));
+    const AArch64InstrInfo *AII =
+      static_cast<const AArch64InstrInfo*>(DAG->TII);
+    DAG->addMutation(createMacroFusionDAGMutation(AII));
     return DAG;
   }
 
Index: llvm/lib/Target/AArch64/CMakeLists.txt
===================================================================
--- llvm/lib/Target/AArch64/CMakeLists.txt
+++ llvm/lib/Target/AArch64/CMakeLists.txt
@@ -55,6 +55,7 @@
   AArch64ISelLowering.cpp
   AArch64InstrInfo.cpp
   AArch64LoadStoreOptimizer.cpp
+  AArch64MacroFusion.cpp
   AArch64MCInstLower.cpp
   AArch64PromoteConstant.cpp
   AArch64PBQPRegAlloc.cpp
Index: llvm/lib/Target/X86/CMakeLists.txt
===================================================================
--- llvm/lib/Target/X86/CMakeLists.txt
+++ llvm/lib/Target/X86/CMakeLists.txt
@@ -43,6 +43,7 @@
   X86EvexToVex.cpp
   X86MCInstLower.cpp
   X86MachineFunctionInfo.cpp
+  X86MacroFusion.cpp
   X86OptimizeLEAs.cpp
   X86PadShortFunction.cpp
   X86RegisterInfo.cpp
Index: llvm/lib/Target/X86/X86InstrInfo.h
===================================================================
--- llvm/lib/Target/X86/X86InstrInfo.h
+++ llvm/lib/Target/X86/X86InstrInfo.h
@@ -25,6 +25,7 @@
 
 namespace llvm {
   class MachineInstrBuilder;
+  class ScheduleDAGInstrs;
   class X86RegisterInfo;
   class X86Subtarget;
 
@@ -443,8 +444,11 @@
                                int64_t Offset1, int64_t Offset2,
                                unsigned NumLoads) const override;
 
-  bool shouldScheduleAdjacent(const MachineInstr &First,
-                              const MachineInstr &Second) const override;
+  /// Attempt to fuse instructions in the given scheduling block.
+  /// Note that you have to add:
+  ///   DAG.addMutation(createMacroFusionDAGMutation());
+  /// to TargetPassConfig::createMachineScheduler() to have an effect.
+  void scheduleAdjacent(ScheduleDAGInstrs *DAGInstrs) const;
 
   void getNoopForMachoTarget(MCInst &NopInst) const override;
 
Index: llvm/lib/Target/X86/X86InstrInfo.cpp
===================================================================
--- llvm/lib/Target/X86/X86InstrInfo.cpp
+++ llvm/lib/Target/X86/X86InstrInfo.cpp
@@ -15,6 +15,7 @@
 #include "X86.h"
 #include "X86InstrBuilder.h"
 #include "X86MachineFunctionInfo.h"
+#include "X86MacroFusion.h"
 #include "X86Subtarget.h"
 #include "X86TargetMachine.h"
 #include "llvm/ADT/STLExtras.h"
@@ -8294,8 +8295,12 @@
   return true;
 }
 
-bool X86InstrInfo::shouldScheduleAdjacent(const MachineInstr &First,
-                                          const MachineInstr &Second) const {
+/// \brief Verify that the instruction pair, \param First and \param Second,
+/// should be scheduled back to back.  If either instruction is unspecified,
+/// then verify that the other instruction may be part of a pair at all.
+static bool shouldScheduleAdjacent(const X86Subtarget &Subtarget,
+                                   const MachineInstr *First,
+                                   const MachineInstr *Second) {
   // Check if this processor supports macro-fusion. Since this is a minor
   // heuristic, we haven't specifically reserved a feature. hasAVX is a decent
   // proxy for SandyBridge+.
@@ -8308,7 +8313,12 @@
     FuseInc
   } FuseKind;
 
-  switch (Second.getOpcode()) {
+  unsigned FirstOpcode = First ?
+                         First->getOpcode() : X86::INSTRUCTION_LIST_END;
+  unsigned SecondOpcode = Second ?
+                          Second->getOpcode() : X86::INSTRUCTION_LIST_END;
+
+  switch (SecondOpcode) {
   default:
     return false;
   case X86::JE_1:
@@ -8334,7 +8344,8 @@
     FuseKind = FuseTest;
     break;
   }
-  switch (First.getOpcode()) {
+
+  switch (FirstOpcode) {
   default:
     return false;
   case X86::TEST8rr:
@@ -8450,6 +8461,49 @@
   case X86::DEC64r:
   case X86::DEC8r:
     return FuseKind == FuseInc;
+  case X86::INSTRUCTION_LIST_END:
+    return true;
+  }
+}
+
+void X86InstrInfo::scheduleAdjacent(ScheduleDAGInstrs *DAGInstrs) const {
+  ScheduleDAGMI *DAG = static_cast<ScheduleDAGMI*>(DAGInstrs);
+
+  // For now, assume targets can only fuse with the branch.
+  SUnit &ExitSU = DAG->ExitSU;
+  MachineInstr *Branch = ExitSU.getInstr();
+  if (!shouldScheduleAdjacent(Subtarget, nullptr, Branch))
+    return;
+
+  for (SDep &PredDep : ExitSU.Preds) {
+    if (PredDep.isWeak())
+      continue;
+    SUnit &SU = *PredDep.getSUnit();
+    MachineInstr &Pred = *SU.getInstr();
+    if (!shouldScheduleAdjacent(Subtarget, &Pred, Branch))
+      continue;
+
+    // Create a single weak edge from SU to ExitSU. The only effect is to cause
+    // bottom-up scheduling to heavily prioritize the clustered SU.  There is no
+    // need to copy predecessor edges from ExitSU to SU, since top-down
+    // scheduling cannot prioritize ExitSU anyway. To defer top-down scheduling
+    // of SU, we could create an artificial edge from the deepest root, but it
+    // hasn't been needed yet.
+    bool Success = DAG->addEdge(&ExitSU, SDep(&SU, SDep::Cluster));
+    (void)Success;
+    assert(Success && "No DAG nodes should be reachable from ExitSU");
+
+    // Adjust latency of data deps between the nodes.
+    for (SDep &PredDep : ExitSU.Preds) {
+      if (PredDep.getSUnit() == &SU)
+        PredDep.setLatency(0);
+    }
+    for (SDep &SuccDep : SU.Succs) {
+      if (SuccDep.getSUnit() == &ExitSU)
+        SuccDep.setLatency(0);
+    }
+
+    break;
   }
 }
 
Index: llvm/lib/Target/X86/X86MacroFusion.h
===================================================================
--- /dev/null
+++ llvm/lib/Target/X86/X86MacroFusion.h
@@ -0,0 +1,40 @@
+//===- X86MacroFusion.h - X86 Macro Fusion --------------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the X86 definition of the DAG scheduling mutation to pair
+// instructions back to back.
+//
+//===----------------------------------------------------------------------===//
+
+#include "X86InstrInfo.h"
+#include "llvm/CodeGen/MachineScheduler.h"
+
+//===----------------------------------------------------------------------===//
+// X86MacroFusion - DAG post-processing to encourage fusion of macro ops.
+//===----------------------------------------------------------------------===//
+
+namespace llvm {
+
+/// \brief Post-process the DAG to create cluster edges between instructions
+/// that may be fused by the processor into a single operation.
+class X86MacroFusion : public ScheduleDAGMutation {
+  const X86InstrInfo &TII;
+public:
+  X86MacroFusion(const X86InstrInfo &TII)
+    : TII(TII) {}
+
+  void apply(ScheduleDAGInstrs *DAGInstrs) override {
+    TII.scheduleAdjacent(DAGInstrs);
+  }
+};
+
+std::unique_ptr<ScheduleDAGMutation>
+createMacroFusionDAGMutation(const X86InstrInfo *TII);
+
+} // llvm
Index: llvm/lib/Target/X86/X86MacroFusion.cpp
===================================================================
--- /dev/null
+++ llvm/lib/Target/X86/X86MacroFusion.cpp
@@ -0,0 +1,31 @@
+//===- X86MacroFusion.cpp - X86 Macro Fusion ------------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the X86 implementation of the DAG scheduling mutation to
+// pair instructions back to back.
+//
+//===----------------------------------------------------------------------===//
+
+#include "X86MacroFusion.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Target/TargetInstrInfo.h"
+
+using namespace llvm;
+
+static cl::opt<bool> EnableMacroFusion("x86-misched-fusion", cl::Hidden,
+  cl::desc("Enable scheduling for macro fusion."), cl::init(true));
+
+namespace llvm {
+
+std::unique_ptr<ScheduleDAGMutation>
+createMacroFusionDAGMutation (const X86InstrInfo *TII) {
+  return EnableMacroFusion ? make_unique<X86MacroFusion>(*TII) : nullptr;
+}
+
+} // llvm
Index: llvm/lib/Target/X86/X86TargetMachine.cpp
===================================================================
--- llvm/lib/Target/X86/X86TargetMachine.cpp
+++ llvm/lib/Target/X86/X86TargetMachine.cpp
@@ -14,6 +14,7 @@
 #include "X86TargetMachine.h"
 #include "X86.h"
 #include "X86CallLowering.h"
+#include "X86MacroFusion.h"
 #include "X86TargetObjectFile.h"
 #include "X86TargetTransformInfo.h"
 #include "llvm/CodeGen/GlobalISel/GISelAccessor.h"
@@ -289,7 +290,8 @@
   ScheduleDAGInstrs *
   createMachineScheduler(MachineSchedContext *C) const override {
     ScheduleDAGMILive *DAG = createGenericSchedLive(C);
-    DAG->addMutation(createMacroFusionDAGMutation(DAG->TII));
+    const X86InstrInfo *AII = static_cast<const X86InstrInfo*>(DAG->TII);
+    DAG->addMutation(createMacroFusionDAGMutation(AII));
     return DAG;
   }
 
Index: llvm/test/CodeGen/AArch64/misched-fusion.ll
===================================================================
--- llvm/test/CodeGen/AArch64/misched-fusion.ll
+++ llvm/test/CodeGen/AArch64/misched-fusion.ll
@@ -1,22 +1,14 @@
 ; RUN: llc -o - %s -mattr=+arith-cbz-fusion | FileCheck %s
 ; RUN: llc -o - %s -mcpu=cyclone | FileCheck %s
 
-target triple = "arm64-apple-ios"
+target triple = "aarch64-unknown"
 
 declare void @foobar(i32 %v0, i32 %v1)
 
 ; Make sure sub is scheduled in front of cbnz
 ; CHECK-LABEL: test_sub_cbz:
-; CHECK: add w[[ADDRES:[0-9]+]], w1, #7
 ; CHECK: sub w[[SUBRES:[0-9]+]], w0, #13
-; CHECK-NEXT: cbnz w[[SUBRES]], [[SKIPBLOCK:LBB[0-9_]+]]
-; CHECK: mov [[REGTY:[x,w]]]0, [[REGTY]][[ADDRES]]
-; CHECK: mov [[REGTY]]1, [[REGTY]][[SUBRES]]
-; CHECK: bl _foobar
-; CHECK: [[SKIPBLOCK]]:
-; CHECK: mov [[REGTY]]0, [[REGTY]][[SUBRES]]
-; CHECK: mov [[REGTY]]1, [[REGTY]][[ADDRES]]
-; CHECK: bl _foobar
+; CHECK-NEXT: cbnz w[[SUBRES]], {{.?LBB[0-9_]+}}
 define void @test_sub_cbz(i32 %a0, i32 %a1) {
 entry:
   ; except for the fusion opportunity the sub/add should be equal so the