Index: llvm/include/llvm/Target/TargetInstrInfo.h
===================================================================
--- llvm/include/llvm/Target/TargetInstrInfo.h
+++ llvm/include/llvm/Target/TargetInstrInfo.h
@@ -39,6 +39,7 @@
 class ScheduleHazardRecognizer;
 class SelectionDAG;
 class ScheduleDAG;
+class ScheduleDAGInstrs;
 class TargetRegisterClass;
 class TargetRegisterInfo;
 class TargetSubtargetInfo;
@@ -1070,13 +1071,12 @@
     llvm_unreachable("target did not implement shouldClusterMemOps()");
   }
 
-  /// Can this target fuse the given instructions if they are scheduled
-  /// adjacent. Note that you have to add:
+  /// Override the fusion of instructions in the given scheduling block.
+  /// Note that you have to add:
   ///   DAG.addMutation(createMacroFusionDAGMutation());
   /// to TargetPassConfig::createMachineScheduler() to have an effect.
-  virtual bool shouldScheduleAdjacent(const MachineInstr &First,
-                                      const MachineInstr &Second) const {
-    llvm_unreachable("target did not implement shouldScheduleAdjacent()");
+  virtual void scheduleAdjacent(ScheduleDAGInstrs *DAGInstrs) const {
+    llvm_unreachable("target did not implement scheduleAdjacent()");
   }
 
   /// Reverses the branch condition of the specified condition list,
Index: llvm/lib/CodeGen/MachineScheduler.cpp
===================================================================
--- llvm/lib/CodeGen/MachineScheduler.cpp
+++ llvm/lib/CodeGen/MachineScheduler.cpp
@@ -1555,7 +1555,9 @@
 
   MacroFusion(const TargetInstrInfo &TII) : TII(TII) {}
 
-  void apply(ScheduleDAGInstrs *DAGInstrs) override;
+  void apply(ScheduleDAGInstrs *DAGInstrs) override {
+    TII.scheduleAdjacent(DAGInstrs);
+  }
 };
 
 } // anonymous
@@ -1568,50 +1570,6 @@
 
 } // namespace llvm
 
-/// \brief Callback from DAG postProcessing to create cluster edges to encourage
-/// fused operations.
-void MacroFusion::apply(ScheduleDAGInstrs *DAGInstrs) {
-  ScheduleDAGMI *DAG = static_cast<ScheduleDAGMI*>(DAGInstrs);
-
-  // For now, assume targets can only fuse with the branch.
-  SUnit &ExitSU = DAG->ExitSU;
-  MachineInstr *Branch = ExitSU.getInstr();
-  if (!Branch)
-    return;
-
-  for (SDep &PredDep : ExitSU.Preds) {
-    if (PredDep.isWeak())
-      continue;
-    SUnit &SU = *PredDep.getSUnit();
-    MachineInstr &Pred = *SU.getInstr();
-    if (!TII.shouldScheduleAdjacent(Pred, *Branch))
-      continue;
-
-    // Create a single weak edge from SU to ExitSU. The only effect is to cause
-    // bottom-up scheduling to heavily prioritize the clustered SU. There is no
-    // need to copy predecessor edges from ExitSU to SU, since top-down
-    // scheduling cannot prioritize ExitSU anyway. To defer top-down scheduling
-    // of SU, we could create an artificial edge from the deepest root, but it
-    // hasn't been needed yet.
-    bool Success = DAG->addEdge(&ExitSU, SDep(&SU, SDep::Cluster));
-    (void)Success;
-    assert(Success && "No DAG nodes should be reachable from ExitSU");
-
-    // Adjust latency of data deps between the nodes.
-    for (SDep &PredDep : ExitSU.Preds) {
-      if (PredDep.getSUnit() == &SU)
-        PredDep.setLatency(0);
-    }
-    for (SDep &SuccDep : SU.Succs) {
-      if (SuccDep.getSUnit() == &ExitSU)
-        SuccDep.setLatency(0);
-    }
-
-    DEBUG(dbgs() << "Macro Fuse SU(" << SU.NodeNum << ")\n");
-    break;
-  }
-}
-
 //===----------------------------------------------------------------------===//
 // CopyConstrain - DAG post-processing to encourage copy elimination.
 //===----------------------------------------------------------------------===//
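Note: for the new hook to fire at all, the macro fusion mutation still has to be
registered when the target constructs its scheduler, as the doc comment above
says. A minimal sketch of that wiring, assuming a target's TargetPassConfig
subclass (MyPassConfig is a placeholder, and the factory's exact signature may
differ between tree revisions):

    ScheduleDAGInstrs *
    MyPassConfig::createMachineScheduler(MachineSchedContext *C) const {
      ScheduleDAGMILive *DAG =
          new ScheduleDAGMILive(C, llvm::make_unique<GenericScheduler>(C));
      // Without this mutation, scheduleAdjacent() is never called: the
      // MacroFusion post-processing step above is what delegates to it.
      DAG->addMutation(createMacroFusionDAGMutation(DAG->TII));
      return DAG;
    }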
Index: llvm/lib/Target/AArch64/AArch64InstrInfo.h
===================================================================
--- llvm/lib/Target/AArch64/AArch64InstrInfo.h
+++ llvm/lib/Target/AArch64/AArch64InstrInfo.h
@@ -136,8 +136,7 @@
   bool shouldClusterMemOps(MachineInstr &FirstLdSt, MachineInstr &SecondLdSt,
                            unsigned NumLoads) const override;
 
-  bool shouldScheduleAdjacent(const MachineInstr &First,
-                              const MachineInstr &Second) const override;
+  void scheduleAdjacent(ScheduleDAGInstrs *DAGInstrs) const override;
 
   MachineInstr *emitFrameIndexDebugValue(MachineFunction &MF, int FrameIx,
                                          uint64_t Offset, const MDNode *Var,
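Note: the implementation below replaces the virtual pair predicate with a
static helper that also supports one-sided queries: passing nullptr for either
instruction maps its opcode to INSTRUCTION_LIST_END, which every fusion switch
accepts, so a caller can cheaply ask "could this instruction ever be half of a
pair?" before walking any dependence edges. A self-contained toy model of the
convention (the opcodes are invented for illustration, not part of the patch):

    #include <cassert>

    enum Opcode { CMP, ADD, BCC, OPCODE_LIST_END };

    // Pair predicate with a one-sided query mode: OPCODE_LIST_END stands in
    // for "any instruction", mirroring the INSTRUCTION_LIST_END trick.
    static bool shouldFuse(Opcode First, Opcode Second) {
      if (Second == BCC || Second == OPCODE_LIST_END)
        switch (First) {
        case CMP:              // a genuine fusion candidate
        case OPCODE_LIST_END:  // anchor-only query: some First could match
          return true;
        default:
          return false;
        }
      return false;
    }

    int main() {
      assert(shouldFuse(CMP, BCC));             // a real pair
      assert(shouldFuse(OPCODE_LIST_END, BCC)); // can anything lead into BCC?
      assert(shouldFuse(CMP, OPCODE_LIST_END)); // can CMP lead a pair at all?
      assert(!shouldFuse(ADD, BCC));            // ADD is not a candidate
      return 0;
    }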
Index: llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
===================================================================
--- llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
+++ llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
@@ -26,6 +26,7 @@
 #include "llvm/CodeGen/MachineMemOperand.h"
 #include "llvm/CodeGen/MachineOperand.h"
 #include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/MachineScheduler.h"
 #include "llvm/CodeGen/StackMaps.h"
 #include "llvm/IR/DebugLoc.h"
 #include "llvm/IR/GlobalValue.h"
@@ -1903,13 +1904,23 @@
   return Offset1 + 1 == Offset2;
 }
 
-bool AArch64InstrInfo::shouldScheduleAdjacent(
-    const MachineInstr &First, const MachineInstr &Second) const {
-  if (Subtarget.hasArithmeticBccFusion()) {
+/// \brief Verify that the instruction pair, \p First and \p Second, should
+/// be scheduled back to back.  Given an anchor instruction, if the other
+/// instruction is unspecified, then verify that the anchor instruction may
+/// be part of a pair at all.
+static bool shouldScheduleAdjacent(const AArch64InstrInfo &TII,
+                                   const AArch64Subtarget &ST,
+                                   const MachineInstr *First,
+                                   const MachineInstr *Second) {
+  unsigned FirstOpcode =
+      First ? First->getOpcode() : AArch64::INSTRUCTION_LIST_END;
+  unsigned SecondOpcode =
+      Second ? Second->getOpcode() : AArch64::INSTRUCTION_LIST_END;
+
+  if (ST.hasArithmeticBccFusion())
     // Fuse CMN, CMP, TST followed by Bcc.
-    unsigned SecondOpcode = Second.getOpcode();
-    if (SecondOpcode == AArch64::Bcc) {
-      switch (First.getOpcode()) {
+    if (SecondOpcode == AArch64::Bcc)
+      switch (FirstOpcode) {
       default:
         return false;
       case AArch64::ADDSWri:
@@ -1936,16 +1947,16 @@
       case AArch64::BICSWrs:
       case AArch64::BICSXrs:
         // Shift value can be 0 making these behave like the "rr" variant...
-        return !hasShiftedReg(Second);
+        return !TII.hasShiftedReg(*First);
+      case AArch64::INSTRUCTION_LIST_END:
+        return true;
       }
-    }
-  }
 
-  if (Subtarget.hasArithmeticCbzFusion()) {
+  if (ST.hasArithmeticCbzFusion())
     // Fuse ALU operations followed by CBZ/CBNZ.
-    unsigned SecondOpcode = Second.getOpcode();
     if (SecondOpcode == AArch64::CBNZW || SecondOpcode == AArch64::CBNZX ||
-        SecondOpcode == AArch64::CBZW || SecondOpcode == AArch64::CBZX) {
-      switch (First.getOpcode()) {
+        SecondOpcode == AArch64::CBZW || SecondOpcode == AArch64::CBZX)
+      switch (FirstOpcode) {
       default:
         return false;
       case AArch64::ADDWri:
@@ -1978,13 +1989,80 @@
       case AArch64::BICWrs:
       case AArch64::BICXrs:
         // Shift value can be 0 making these behave like the "rr" variant...
-        return !hasShiftedReg(Second);
+        return !TII.hasShiftedReg(*First);
+      case AArch64::INSTRUCTION_LIST_END:
+        return true;
       }
-  }
+
+  return false;
+}
+
+/// \brief Implement the fusion of instruction pairs in the scheduling
+/// \p DAG, anchored at the instruction in \p ASU.  \p Preds indicates if its
+/// dependencies in \p APreds are predecessors instead of successors.
+static bool scheduleAdjacentImpl(const AArch64InstrInfo &TII,
+                                 const AArch64Subtarget &ST,
+                                 ScheduleDAGMI *DAG, SUnit *ASU,
+                                 SmallVectorImpl<SDep> &APreds, bool Preds) {
+  const MachineInstr *AMI = ASU->getInstr();
+  if (!AMI || AMI->isPseudo() || AMI->isTransient() ||
+      (Preds && !shouldScheduleAdjacent(TII, ST, nullptr, AMI)) ||
+      (!Preds && !shouldScheduleAdjacent(TII, ST, AMI, nullptr)))
+    return false;
+
+  for (SDep &BDep : APreds) {
+    if (BDep.isWeak())
+      continue;
+
+    SUnit *BSU = BDep.getSUnit();
+    const MachineInstr *BMI = BSU->getInstr();
+    if (!BMI || BMI->isPseudo() || BMI->isTransient() ||
+        (Preds && !shouldScheduleAdjacent(TII, ST, BMI, AMI)) ||
+        (!Preds && !shouldScheduleAdjacent(TII, ST, AMI, BMI)))
+      continue;
+
+    // Create a single weak edge between the adjacent instrs. The only effect
+    // is to cause bottom-up scheduling to heavily prioritize the clustered
+    // instrs.
+    if (Preds)
+      DAG->addEdge(ASU, SDep(BSU, SDep::Cluster));
+    else
+      DAG->addEdge(BSU, SDep(ASU, SDep::Cluster));
+
+    // Adjust the latency between the 1st instr and its
+    // predecessors/successors.
+    for (SDep &Dep : APreds)
+      if (Dep.getSUnit() == BSU)
+        Dep.setLatency(0);
+
+    // Adjust the latency between the 2nd instr and its
+    // successors/predecessors.
+    auto &BSuccs = Preds ? BSU->Succs : BSU->Preds;
+    for (SDep &Dep : BSuccs)
+      if (Dep.getSUnit() == ASU)
+        Dep.setLatency(0);
+
+    return true;
+  }
+
+  return false;
+}
+
+/// \brief Callback from DAG postProcessing to create cluster edges to
+/// encourage fused operations.
+void AArch64InstrInfo::scheduleAdjacent(ScheduleDAGInstrs *DAGInstrs) const {
+  ScheduleDAGMI *DAG = static_cast<ScheduleDAGMI *>(DAGInstrs);
+
+  // For each of the SUnits in the scheduling block, try to fuse the
+  // instruction in it with one in its successors.
+  for (SUnit &ASU : DAG->SUnits)
+    scheduleAdjacentImpl(*this, Subtarget, DAG, &ASU, ASU.Succs, false);
+
+  // Try to fuse the instruction in the ExitSU with one in its predecessors.
+  scheduleAdjacentImpl(*this, Subtarget,
+                       DAG, &DAG->ExitSU, DAG->ExitSU.Preds, true);
+}
+
 MachineInstr *AArch64InstrInfo::emitFrameIndexDebugValue(
     MachineFunction &MF, int FrameIx, uint64_t Offset, const MDNode *Var,
     const MDNode *Expr, const DebugLoc &DL) const {
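Note: the core of scheduleAdjacentImpl above is a reusable two-step pattern:
add a weak SDep::Cluster edge so that bottom-up scheduling keeps the pair
together, then zero out the latency of the data edges between the two nodes so
the pair is costed like a single macro-op. Distilled into a standalone helper,
as a sketch against the ScheduleDAGMI/SDep API used in this patch (the helper
name is invented here):

    #include "llvm/CodeGen/MachineScheduler.h"

    // Cluster First before Second and cost the pair as one macro-op.
    // Returns false if the cluster edge could not be added.
    static bool clusterPair(llvm::ScheduleDAGMI *DAG, llvm::SUnit *First,
                            llvm::SUnit *Second) {
      using llvm::SDep;
      // The weak edge only biases the scheduler; it carries no latency
      // requirement of its own and may be dropped if unsatisfiable.
      if (!DAG->addEdge(Second, SDep(First, SDep::Cluster)))
        return false;
      // The dependence between the fused instructions resolves inside the
      // macro-op, so model it as zero latency in both edge lists.
      for (SDep &Dep : Second->Preds)
        if (Dep.getSUnit() == First)
          Dep.setLatency(0);
      for (SDep &Dep : First->Succs)
        if (Dep.getSUnit() == Second)
          Dep.setLatency(0);
      return true;
    }

Unlike the X86 code below, the AArch64 walk also visits the ordinary SUnits,
so fused pairs are no longer limited to a compare feeding the block-ending
branch.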
Index: llvm/lib/Target/X86/X86InstrInfo.h
===================================================================
--- llvm/lib/Target/X86/X86InstrInfo.h
+++ llvm/lib/Target/X86/X86InstrInfo.h
@@ -443,8 +443,7 @@
                                int64_t Offset1, int64_t Offset2,
                                unsigned NumLoads) const override;
 
-  bool shouldScheduleAdjacent(const MachineInstr &First,
-                              const MachineInstr &Second) const override;
+  void scheduleAdjacent(ScheduleDAGInstrs *DAGInstrs) const override;
 
   void getNoopForMachoTarget(MCInst &NopInst) const override;
 
Index: llvm/lib/Target/X86/X86InstrInfo.cpp
===================================================================
--- llvm/lib/Target/X86/X86InstrInfo.cpp
+++ llvm/lib/Target/X86/X86InstrInfo.cpp
@@ -26,6 +26,7 @@
 #include "llvm/CodeGen/MachineInstrBuilder.h"
 #include "llvm/CodeGen/MachineModuleInfo.h"
 #include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/MachineScheduler.h"
 #include "llvm/CodeGen/StackMaps.h"
 #include "llvm/IR/DerivedTypes.h"
 #include "llvm/IR/Function.h"
@@ -8294,8 +8295,12 @@
   return true;
 }
 
-bool X86InstrInfo::shouldScheduleAdjacent(const MachineInstr &First,
-                                          const MachineInstr &Second) const {
+/// \brief Verify that the instruction pair, \p First and \p Second, should
+/// be scheduled back to back.  If either instruction is unspecified, then
+/// verify that the other instruction may be part of a pair at all.
+static bool shouldScheduleAdjacent(const X86Subtarget &Subtarget,
+                                   const MachineInstr *First,
+                                   const MachineInstr *Second) {
   // Check if this processor supports macro-fusion. Since this is a minor
   // heuristic, we haven't specifically reserved a feature. hasAVX is a decent
   // proxy for SandyBridge+.
@@ -8308,7 +8313,12 @@
     FuseInc
   } FuseKind;
 
-  switch (Second.getOpcode()) {
+  unsigned FirstOpcode =
+      First ? First->getOpcode() : X86::INSTRUCTION_LIST_END;
+  unsigned SecondOpcode =
+      Second ? Second->getOpcode() : X86::INSTRUCTION_LIST_END;
+
+  switch (SecondOpcode) {
   default:
     return false;
   case X86::JE_1:
@@ -8334,7 +8344,8 @@
     FuseKind = FuseTest;
     break;
   }
-  switch (First.getOpcode()) {
+
+  switch (FirstOpcode) {
   default:
     return false;
   case X86::TEST8rr:
@@ -8450,6 +8461,49 @@
   case X86::DEC64r:
   case X86::DEC8r:
     return FuseKind == FuseInc;
+  case X86::INSTRUCTION_LIST_END:
+    return true;
+  }
+}
+
+void X86InstrInfo::scheduleAdjacent(ScheduleDAGInstrs *DAGInstrs) const {
+  ScheduleDAGMI *DAG = static_cast<ScheduleDAGMI *>(DAGInstrs);
+
+  // For now, assume targets can only fuse with the branch.
+  SUnit &ExitSU = DAG->ExitSU;
+  MachineInstr *Branch = ExitSU.getInstr();
+  if (!shouldScheduleAdjacent(Subtarget, nullptr, Branch))
+    return;
+
+  for (SDep &PredDep : ExitSU.Preds) {
+    if (PredDep.isWeak())
+      continue;
+    SUnit &SU = *PredDep.getSUnit();
+    MachineInstr &Pred = *SU.getInstr();
+    if (!shouldScheduleAdjacent(Subtarget, &Pred, Branch))
+      continue;
+
+    // Create a single weak edge from SU to ExitSU. The only effect is to cause
+    // bottom-up scheduling to heavily prioritize the clustered SU. There is no
+    // need to copy predecessor edges from ExitSU to SU, since top-down
+    // scheduling cannot prioritize ExitSU anyway. To defer top-down scheduling
+    // of SU, we could create an artificial edge from the deepest root, but it
+    // hasn't been needed yet.
+    bool Success = DAG->addEdge(&ExitSU, SDep(&SU, SDep::Cluster));
+    (void)Success;
+    assert(Success && "No DAG nodes should be reachable from ExitSU");
+
+    // Adjust latency of data deps between the nodes.
+    for (SDep &PredDep : ExitSU.Preds) {
+      if (PredDep.getSUnit() == &SU)
+        PredDep.setLatency(0);
+    }
+    for (SDep &SuccDep : SU.Succs) {
+      if (SuccDep.getSUnit() == &ExitSU)
+        SuccDep.setLatency(0);
+    }
+
+    break;
+  }
+}
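Note: the X86 hook is a near-verbatim move of the old generic
MacroFusion::apply; the only new behavior is the cheap early-out via
shouldScheduleAdjacent(Subtarget, nullptr, Branch) before walking the
predecessors. One observable loss in the move is the DEBUG trace of fused
SUnits, which neither target copy re-adds. If that trace is still wanted, it
could be restored next to the addEdge call; a sketch, assuming the usual
llvm/Support/Debug.h machinery and the DEBUG_TYPE already defined in
X86InstrInfo.cpp:

    // Mirrors the output the removed generic code emitted.
    DEBUG(dbgs() << "Macro fuse SU(" << SU.NodeNum << ") - ExitSU\n");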