Index: lib/CodeGen/MachineScheduler.cpp
===================================================================
--- lib/CodeGen/MachineScheduler.cpp
+++ lib/CodeGen/MachineScheduler.cpp
@@ -405,6 +405,7 @@
 
   // Initialize the context of the pass.
   MF = &mf;
+  MLI = &getAnalysis<MachineLoopInfo>();
   PassConfig = &getAnalysis<TargetPassConfig>();
 
   if (VerifyScheduling)
Index: lib/Target/SystemZ/SystemZHazardRecognizer.h
===================================================================
--- lib/Target/SystemZ/SystemZHazardRecognizer.h
+++ lib/Target/SystemZ/SystemZHazardRecognizer.h
@@ -19,6 +19,13 @@
 // * Processor resources usage. It is beneficial to balance the use of
 // resources.
 //
+// A goal is to consider all instructions, also those outside of any
+// scheduling region. Such instructions are "advanced" past and include
+// single instructions before a scheduling region, branches etc.
+//
+// A block that has only one predecessor continues scheduling from that
+// predecessor's state (which may be updated by emitting its branches).
+//
 // ===---------------------------------------------------------------------===//
 
 #ifndef LLVM_LIB_TARGET_SYSTEMZ_SYSTEMZHAZARDRECOGNIZER_H
@@ -35,11 +42,29 @@
 
 namespace llvm {
 
-/// SystemZHazardRecognizer maintains the state during scheduling.
+class SystemZHazardRecognizer;
+typedef std::map<MachineBasicBlock*, SystemZHazardRecognizer*> MBB2HazRec;
+
+/// SystemZHazardRecognizer maintains the state for one MBB during
+/// scheduling.
 class SystemZHazardRecognizer : public ScheduleHazardRecognizer {
 
+  // The global map of scheduler states.
+  MBB2HazRec *SchedStates;
+
   ScheduleDAGMI *DAG;
-  const TargetSchedModel *SchedModel;
+  const SystemZInstrInfo *TII;
+
+  // A SchedModel is needed before any DAG is built, while advancing past
+  // non-scheduled instructions.
+  TargetSchedModel SchedModel;
+
+  // MBB and Loop that this HazardRecognizer will operate in.
+  MachineBasicBlock *MBB;
+  const MachineLoop *Loop;
+
+  // Start of region
+  MachineBasicBlock::iterator Begin;
 
   /// Keep track of the number of decoder slots used in the current
   /// decoder group.
@@ -52,17 +77,17 @@
 
   /// Counters for the number of uops scheduled per processor
   /// resource.
-  SmallVector<int, 0> ProcResourceCounters;
+  SmallVector<int, 0> ResourceCounters;
 
   /// This is the resource with the greatest queue, which the
   /// scheduler tries to avoid.
   unsigned CriticalResourceIdx;
 
   /// Return the number of decoder slots MI requires.
-  inline unsigned getNumDecoderSlots(SUnit *SU) const;
+  inline unsigned getNumDecoderSlots(const MachineInstr *MI) const;
 
   /// Return true if MI fits into current decoder group.
-  bool fitsIntoCurrentGroup(SUnit *SU) const;
+  bool fitsIntoCurrentGroup(MachineInstr *MI) const;
 
   /// Two decoder groups per cycle are formed (for z13), meaning 2x3
   /// instructions. This function returns a number between 0 and 5,
@@ -76,29 +101,55 @@
   /// A counter of decoder groups scheduled.
   unsigned GrpCount;
 
-  unsigned getCurrGroupSize() {return CurrGroupSize;};
-
   /// Start next decoder group.
-  void nextGroup(bool DbgOutput = true);
+  void nextGroup();
 
   /// Clear all counters for processor resources.
-  void clearProcResCounters();
+  void clearResourceCounters();
+
+  /// Last emitted instruction or nullptr.
+  MachineInstr *LastEmittedMI;
 
   /// With the goal of alternating processor sides for stalling (FPd)
   /// ops, return true if it seems good to schedule an FPd op next.
   bool isFPdOpPreferred_distance(const SUnit *SU);
 
+  /// There is no SU when advancing past non-scheduled instructions.
+  void emitInstruction(MachineInstr *MI, SUnit *SU = nullptr);
+  void emitInstructionIntoCurrentDecoderGroup(MachineInstr *MI);
+  /// Emit a branch in a predecessor, and return true if it is a taken branch
+  /// to MBB.
+  bool emitIncomingBranch(MachineInstr *MI);
+
+  /// Update the scheduler state by emitting (non-scheduled) instructions
+  /// from I to NextBegin.
+  void advance(MachineBasicBlock::iterator I,
+               MachineBasicBlock::iterator NextBegin);
+
+  /// Take over state and continue scheduling from end of single predecessor.
+  void takeStateFromPred();
+
+  /// LastCall is used as an optimization in leaveMBB() so that in the case
+  /// of a call in MBB, the final state is achieved by looking at just
+  /// instructions after it.
+  MachineBasicBlock::iterator LastCall;
+
 public:
-  SystemZHazardRecognizer(const MachineSchedContext *C);
+  SystemZHazardRecognizer(const TargetSubtargetInfo *ST, MBB2HazRec *SchedS_);
+
+  void enterRegion(MachineBasicBlock *MBB_,
+                   const MachineLoop *Loop_,
+                   MachineBasicBlock::iterator Begin_);
+
+  /// Called just before scheduling begins, with the DAG.
+  void initialize(ScheduleDAGMI *dag);
 
-  void setDAG(ScheduleDAGMI *dag) {
-    DAG = dag;
-    SchedModel = dag->getSchedModel();
-  }
-  
   HazardType getHazardType(SUnit *m, int Stalls = 0) override;    
+
   void Reset() override;
-  void EmitInstruction(SUnit *SU) override;
+  void EmitInstruction(SUnit *SU) override {
+    emitInstruction(SU->getInstr(), SU);
+  }
 
   // Cost functions used by SystemZPostRASchedStrategy while
   // evaluating candidates.
@@ -107,20 +158,30 @@
   /// new decoder group, this is negative if this fits the schedule or
   /// positive if it would mean ending a group prematurely. For normal
   /// instructions this returns 0.
-  int groupingCost(SUnit *SU) const; 
+  int groupingCost(const SUnit *SU) const; 
 
   /// Return the cost of SU in regards to processor resources usage.
   /// A positive value means it would be better to wait with SU, while
   /// a negative value means it would be good to schedule SU next.
-  int resourcesCost(SUnit *SU);
+  int resourcesCost(const SUnit *SU);
 
 #ifndef NDEBUG
   // Debug dumping.
   std::string CurGroupDbg; // current group as text
   void dumpSU(SUnit *SU, raw_ostream &OS) const;
+  void dumpMI(MachineInstr *MI, raw_ostream &OS) const;
   void dumpCurrGroup(std::string Msg = "") const;
-  void dumpProcResourceCounters() const;
+  void dumpResourceCounters() const;
 #endif
+
+  /// Remember the last (in instruction list) call in MBB.
+  void setLastCall(MachineBasicBlock::iterator Call) {
+    if (LastCall == nullptr)
+      LastCall = Call;
+  }
+
+  /// Leave MBB after scheduling is done.
+  void leaveMBB();
 };
 
 } // namespace llvm
Index: lib/Target/SystemZ/SystemZHazardRecognizer.cpp
===================================================================
--- lib/Target/SystemZ/SystemZHazardRecognizer.cpp
+++ lib/Target/SystemZ/SystemZHazardRecognizer.cpp
@@ -19,6 +19,13 @@
 // * Processor resources usage. It is beneficial to balance the use of
 // resources.
 //
+// A goal is to consider all instructions, also those outside of any
+// scheduling region. Such instructions are "advanced" past and include
+// single instructions before a scheduling region, branches etc.
+//
+// A block that has only one predecessor continues scheduling from that
+// predecessor's state (which may be updated by emitting its branches).
+//
 // ===---------------------------------------------------------------------===//
 
 #include "SystemZHazardRecognizer.h"
@@ -26,23 +33,60 @@
 
 using namespace llvm;
 
-#define DEBUG_TYPE "misched"
+#define DEBUG_TYPE "machine-scheduler"
 
 // This is the limit of processor resource usage at which the
 // scheduler should try to look for other instructions (not using the
 // critical resource).
-static cl::opt<int> ProcResCostLim("procres-cost-lim", cl::Hidden,
+static cl::opt<int> ResourceCostLim("procres-cost-lim", cl::Hidden,
                                    cl::desc("The OOO window for processor "
                                             "resources during scheduling."),
                                    cl::init(8));
 
 SystemZHazardRecognizer::
-SystemZHazardRecognizer(const MachineSchedContext *C) : DAG(nullptr),
-                                                        SchedModel(nullptr) {}
+SystemZHazardRecognizer(const TargetSubtargetInfo *ST, MBB2HazRec *SchedS_)
+  : SchedStates(SchedS_), DAG(nullptr), TII(nullptr),
+    MBB(nullptr), Loop(nullptr), Begin(nullptr), CurrGroupSize(0),
+    LastFPdOpCycleIdx(UINT_MAX), GrpCount(0), LastEmittedMI(nullptr),
+    LastCall(nullptr) {
+  TII = static_cast<const SystemZInstrInfo *>(ST->getInstrInfo());
+  SchedModel.init(ST->getSchedModel(), ST, TII);
+  clearResourceCounters();
+}
+
+void SystemZHazardRecognizer::enterRegion(MachineBasicBlock *MBB_,
+                                          const MachineLoop *Loop_,
+                                          MachineBasicBlock::iterator Begin_) {
+  Reset();
+  MBB = MBB_;
+  Loop = Loop_;
+  Begin = Begin_;
+}
+
+void SystemZHazardRecognizer::initialize(ScheduleDAGMI *dag) {
+  DAG = dag;
+
+  // There may be non-scheduled instructions before Begin. Look backwards
+  // until beginning of block or a call.
+  MachineBasicBlock::iterator PreRegBegin = Begin;
+  for (; PreRegBegin != MBB->begin(); --PreRegBegin) {
+    if (std::prev(PreRegBegin)->isCall())
+      break;
+  }
+  
+  // If this is top-most in MBB, try to take over the state from a single
+  // predecessor.
+  if (PreRegBegin == MBB->begin())
+    takeStateFromPred();
+
+  // Emit any instructions before Begin.
+  advance(PreRegBegin, Begin);
+}
 
 unsigned SystemZHazardRecognizer::
-getNumDecoderSlots(SUnit *SU) const {
-  const MCSchedClassDesc *SC = DAG->getSchedClass(SU);
+getNumDecoderSlots(const MachineInstr *MI) const {
+  const MCSchedClassDesc *SC = SchedModel.resolveSchedClass(MI);
+
   if (!SC->isValid())
     return 0; // IMPLICIT_DEF / KILL -- will not make impact in output.
 
@@ -58,27 +102,33 @@
 
 unsigned SystemZHazardRecognizer::getCurrCycleIdx() {
   unsigned Idx = CurrGroupSize;
-  if (GrpCount % 2)
+  if ((GrpCount % 2) != 0)
     Idx += 3;
   return Idx;
 }
 
 ScheduleHazardRecognizer::HazardType SystemZHazardRecognizer::
 getHazardType(SUnit *m, int Stalls) {
-  return (fitsIntoCurrentGroup(m) ? NoHazard : Hazard);
+  return (fitsIntoCurrentGroup(m->getInstr()) ? NoHazard : Hazard);
 }
 
 void SystemZHazardRecognizer::Reset() {
+  DAG = nullptr;
+  MBB = nullptr;
+  Loop = nullptr;
+  Begin = nullptr;
   CurrGroupSize = 0;
-  clearProcResCounters();
-  GrpCount = 0;
+  clearResourceCounters();
   LastFPdOpCycleIdx = UINT_MAX;
-  DEBUG(CurGroupDbg = "";);
+  GrpCount = 0;
+  LastEmittedMI = nullptr;
+  LastCall = nullptr;
+  DEBUG (CurGroupDbg = "";);
 }
 
 bool
-SystemZHazardRecognizer::fitsIntoCurrentGroup(SUnit *SU) const {
-  const MCSchedClassDesc *SC = DAG->getSchedClass(SU);
+SystemZHazardRecognizer::fitsIntoCurrentGroup(MachineInstr *MI) const {
+  const MCSchedClassDesc *SC = SchedModel.resolveSchedClass(MI);
   if (!SC->isValid())
     return true;
 
@@ -87,19 +137,19 @@
   if (SC->BeginGroup)
     return (CurrGroupSize == 0);
 
-  // Since a full group is handled immediately in EmitInstruction(),
+  // Since a full group is handled immediately in emitInstruction(),
   // SU should fit into current group. NumSlots should be 1 or 0,
   // since it is not a cracked or expanded instruction.
-  assert ((getNumDecoderSlots(SU) <= 1) && (CurrGroupSize < 3) &&
+  assert ((getNumDecoderSlots(MI) <= 1) && (CurrGroupSize < 3) &&
           "Expected normal instruction to fit in non-full group!");
 
   return true;
 }
 
-void SystemZHazardRecognizer::nextGroup(bool DbgOutput) {
+void SystemZHazardRecognizer::nextGroup() {
   if (CurrGroupSize > 0) {
-    DEBUG(dumpCurrGroup("Completed decode group"));
-    DEBUG(CurGroupDbg = "";);
+    DEBUG (dumpCurrGroup("Completed decode group"));
+    DEBUG (CurGroupDbg = "";);
 
     GrpCount++;
 
@@ -107,35 +157,39 @@
     CurrGroupSize = 0;
 
     // Decrease counters for execution units by one.
-    for (unsigned i = 0; i < SchedModel->getNumProcResourceKinds(); ++i)
-      if (ProcResourceCounters[i] > 0)
-        ProcResourceCounters[i]--;
+    for (unsigned i = 0; i < SchedModel.getNumProcResourceKinds(); ++i)
+      if (ResourceCounters[i] > 0)
+        ResourceCounters[i]--;
 
     // Clear CriticalResourceIdx if it is now below the threshold.
     if (CriticalResourceIdx != UINT_MAX &&
-        (ProcResourceCounters[CriticalResourceIdx] <=
-         ProcResCostLim))
+        (ResourceCounters[CriticalResourceIdx] <= ResourceCostLim))
       CriticalResourceIdx = UINT_MAX;
   }
 
-  DEBUG(if (DbgOutput)
-          dumpProcResourceCounters(););
+  DEBUG (dumpResourceCounters(););
 }
 
 #ifndef NDEBUG // Debug output
-void SystemZHazardRecognizer::dumpSU(SUnit *SU, raw_ostream &OS) const {
+void SystemZHazardRecognizer::dumpSU(SUnit *SU, raw_ostream &OS) const{
   OS << "SU(" << SU->NodeNum << "):";
-  OS << SchedModel->getInstrInfo()->getName(SU->getInstr()->getOpcode());
+  dumpMI(SU->getInstr(), OS);
+  if (SU->isUnbuffered)
+    OS << "/Unbuffered";
+}
+
+void SystemZHazardRecognizer::dumpMI(MachineInstr *MI, raw_ostream &OS) const{
+  OS << TII->getName(MI->getOpcode());
+  const MCSchedClassDesc *SC = SchedModel.resolveSchedClass(MI);
 
-  const MCSchedClassDesc *SC = DAG->getSchedClass(SU);
   if (!SC->isValid())
     return;
   
   for (TargetSchedModel::ProcResIter
-         PI = SchedModel->getWriteProcResBegin(SC),
-         PE = SchedModel->getWriteProcResEnd(SC); PI != PE; ++PI) {
+         PI = SchedModel.getWriteProcResBegin(SC),
+         PE = SchedModel.getWriteProcResEnd(SC); PI != PE; ++PI) {
     const MCProcResourceDesc &PRD =
-      *SchedModel->getProcResource(PI->ProcResourceIdx);
+      *SchedModel.getProcResource(PI->ProcResourceIdx);
     std::string FU(PRD.Name);
     // trim e.g. Z13_FXaUnit -> FXa
     FU = FU.substr(FU.find("_") + 1);
@@ -154,8 +208,6 @@
     OS << "/BeginsGroup";
   else if (SC->EndGroup)
     OS << "/EndsGroup";
-  if (SU->isUnbuffered)
-    OS << "/Unbuffered";
 }
 
 void SystemZHazardRecognizer::dumpCurrGroup(std::string Msg) const {
@@ -172,11 +224,11 @@
   }
 }
 
-void SystemZHazardRecognizer::dumpProcResourceCounters() const {
+void SystemZHazardRecognizer::dumpResourceCounters() const {
   bool any = false;
 
-  for (unsigned i = 0; i < SchedModel->getNumProcResourceKinds(); ++i)
-    if (ProcResourceCounters[i] > 0) {
+  for (unsigned i = 0; i < SchedModel.getNumProcResourceKinds(); ++i)
+    if (ResourceCounters[i] > 0) {
       any = true;
       break;
     }
@@ -185,93 +237,146 @@
     return;
 
   dbgs() << "+++ Resource counters:\n";
-  for (unsigned i = 0; i < SchedModel->getNumProcResourceKinds(); ++i)
-    if (ProcResourceCounters[i] > 0) {
+  for (unsigned i = 0; i < SchedModel.getNumProcResourceKinds(); ++i)
+    if (ResourceCounters[i] > 0) {
       dbgs() << "+++ Extra schedule for execution unit "
-             << SchedModel->getProcResource(i)->Name
-             << ": " << ProcResourceCounters[i] << "\n";
-      any = true;
+             << SchedModel.getProcResource(i)->Name
+             << ": " << ResourceCounters[i] << "\n";
     }
 }
 #endif //NDEBUG
 
-void SystemZHazardRecognizer::clearProcResCounters() {
-  ProcResourceCounters.assign(SchedModel->getNumProcResourceKinds(), 0);
+void SystemZHazardRecognizer::clearResourceCounters() {
+  ResourceCounters.assign(SchedModel.getNumProcResourceKinds(), 0);
   CriticalResourceIdx = UINT_MAX;
 }
 
-// Update state with SU as the next scheduled unit.
+// Update state with MI as next instruction. If SU is null, this
+// is e.g. a scheduling boundary.
 void SystemZHazardRecognizer::
-EmitInstruction(SUnit *SU) {
-  const MCSchedClassDesc *SC = DAG->getSchedClass(SU);
-  DEBUG( dumpCurrGroup("Decode group before emission"););
+emitInstruction(MachineInstr *MI, SUnit *SU) {
+  assert (!MI->isBranch() && "Did not expect a branch here.");
+  assert (SU == nullptr || MI == SU->getInstr());
 
-  // If scheduling an SU that must begin a new decoder group, move on
-  // to next group.
-  if (!fitsIntoCurrentGroup(SU))
+  DEBUG (dumpCurrGroup("Decode group before emission"););
+
+  // If scheduling an MI that must begin a new decoder group, do so.
+  if (!fitsIntoCurrentGroup(MI))
     nextGroup();
 
-  DEBUG( dbgs() << "+++ HazardRecognizer emitting "; dumpSU(SU, dbgs());
-         dbgs() << "\n";
-         raw_string_ostream cgd(CurGroupDbg);
-         if (CurGroupDbg.length())
-           cgd << ", ";
-         dumpSU(SU, cgd););
+  DEBUG (if (SU != nullptr) {
+      dbgs() << "+++ HazardRecognizer emitting "; dumpSU(SU, dbgs());
+      dbgs() << "\n";
+      raw_string_ostream cgd(CurGroupDbg);
+      if (CurGroupDbg.length())
+        cgd << ", ";
+      dumpSU(SU, cgd);
+    } else {
+      dbgs() << "+++ Advancing past: ";
+      dumpMI(MI, dbgs());
+      dbgs() << "\n";
+
+      raw_string_ostream cgd(CurGroupDbg);
+      if (CurGroupDbg.length())
+        cgd << ", ";
+      cgd << TII->getName(MI->getOpcode());
+    });
+
+  LastEmittedMI = MI;
 
   // After returning from a call, we don't know much about the state.
-  if (SU->getInstr()->isCall()) {
+  if (MI->isCall()) {
     DEBUG (dbgs() << "+++ Clearing state after call.\n";);
-    clearProcResCounters();
+    clearResourceCounters();
     LastFPdOpCycleIdx = UINT_MAX;
-    CurrGroupSize += getNumDecoderSlots(SU);
+    CurrGroupSize += getNumDecoderSlots(MI);
     assert (CurrGroupSize <= 3);
     nextGroup();
     return;
   }
 
+  // Make note of an instruction that uses a blocking resource (FPd).
+  if ((SU != nullptr && SU->isUnbuffered)) {
+    LastFPdOpCycleIdx = getCurrCycleIdx();
+    DEBUG (dbgs() << "+++ Last FPd cycle index: "
+           << LastFPdOpCycleIdx << "\n";);
+  }
+
+  emitInstructionIntoCurrentDecoderGroup(MI);
+}
+
+void SystemZHazardRecognizer::
+emitInstructionIntoCurrentDecoderGroup(MachineInstr *MI) {
+  const MCSchedClassDesc *SC = SchedModel.resolveSchedClass(MI);
+
   // Increase counter for execution unit(s).
   for (TargetSchedModel::ProcResIter
-         PI = SchedModel->getWriteProcResBegin(SC),
-         PE = SchedModel->getWriteProcResEnd(SC); PI != PE; ++PI) {
+         PI = SchedModel.getWriteProcResBegin(SC),
+         PE = SchedModel.getWriteProcResEnd(SC); PI != PE; ++PI) {
     // Don't handle FPd together with the other resources.
-    if (SchedModel->getProcResource(PI->ProcResourceIdx)->BufferSize == 1)
+    if (SchedModel.getProcResource(PI->ProcResourceIdx)->BufferSize == 1)
       continue;
     int &CurrCounter =
-      ProcResourceCounters[PI->ProcResourceIdx];
+      ResourceCounters[PI->ProcResourceIdx];
     CurrCounter += PI->Cycles;
     // Check if this is now the new critical resource.
-    if ((CurrCounter > ProcResCostLim) &&
+    if ((CurrCounter > ResourceCostLim) &&
         (CriticalResourceIdx == UINT_MAX ||
          (PI->ProcResourceIdx != CriticalResourceIdx &&
-          CurrCounter >
-          ProcResourceCounters[CriticalResourceIdx]))) {
-      DEBUG( dbgs() << "+++ New critical resource: "
-             << SchedModel->getProcResource(PI->ProcResourceIdx)->Name
+          CurrCounter > ResourceCounters[CriticalResourceIdx]))) {
+      DEBUG (dbgs() << "+++ New critical resource: "
+             << SchedModel.getProcResource(PI->ProcResourceIdx)->Name
              << "\n";);
       CriticalResourceIdx = PI->ProcResourceIdx;
     }
   }
 
-  // Make note of an instruction that uses a blocking resource (FPd).
-  if (SU->isUnbuffered) {
-    LastFPdOpCycleIdx = getCurrCycleIdx();
-    DEBUG (dbgs() << "+++ Last FPd cycle index: "
-           << LastFPdOpCycleIdx << "\n";);
-  }
-
-  // Insert SU into current group by increasing number of slots used
+  // Insert MI into current group by increasing number of slots used
   // in current group.
-  CurrGroupSize += getNumDecoderSlots(SU);
+  CurrGroupSize += getNumDecoderSlots(MI);
   assert (CurrGroupSize <= 3);
 
-  // Check if current group is now full/ended. If so, move on to next
-  // group to be ready to evaluate more candidates.
+  // Check if current group is now full/ended. If so, reset counter to
+  // be ready to evaluate candidates again.
   if (CurrGroupSize == 3 || SC->EndGroup)
     nextGroup();
 }
 
-int SystemZHazardRecognizer::groupingCost(SUnit *SU) const {
-  const MCSchedClassDesc *SC = DAG->getSchedClass(SU);
+bool SystemZHazardRecognizer::emitIncomingBranch(MachineInstr *MI) {
+  DEBUG (dbgs() << "+++ Emitting incoming branch: "; MI->dump(););
+
+  // If MI does not fit into the current decoder group, start a new one.
+  if (!fitsIntoCurrentGroup(MI))
+    nextGroup();
+
+  DEBUG ({ raw_string_ostream cgd(CurGroupDbg);
+           if (CurGroupDbg.length())
+             cgd << ", ";
+           dumpMI(MI, cgd); });
+
+  emitInstructionIntoCurrentDecoderGroup(MI);
+
+  if (MI->isBranch() &&
+      (TII->getBranchInfo(*MI).Target->isReg() || // Register (indirect) target
+       TII->getBranchInfo(*MI).Target->getMBB() == MBB)) {
+    // Treated as a taken branch into MBB: close any open decoder group.
+    if (CurrGroupSize > 0)
+      nextGroup();
+    return true;
+  }
+
+  assert ((MI->isBranch() || MI->isReturn() || MI->getOpcode() == SystemZ::CondTrap) &&
+          "Scheduler: expected a branch or conditional return/trap");
+
+  // Not taken: end the group if two decoder slots are now in use.
+  if (CurrGroupSize == 2)
+    nextGroup();
+
+  return false;
+}
+
+int SystemZHazardRecognizer::groupingCost(const SUnit *SU) const {
+  const MCSchedClassDesc *SC = SchedModel.resolveSchedClass(SU->getInstr());
   if (!SC->isValid())
     return 0;
   
@@ -287,7 +392,7 @@
   // end the group prematurely.
   if (SC->EndGroup) {
     unsigned resultingGroupSize =
-      (CurrGroupSize + getNumDecoderSlots(SU));
+      (CurrGroupSize + getNumDecoderSlots(SU->getInstr()));
     if (resultingGroupSize < 3)
       return (3 - resultingGroupSize);
     return -1;
@@ -312,10 +417,10 @@
 }
 
 int SystemZHazardRecognizer::
-resourcesCost(SUnit *SU) {
+resourcesCost(const SUnit *SU) {
   int Cost = 0;
 
-  const MCSchedClassDesc *SC = DAG->getSchedClass(SU);
+  const MCSchedClassDesc *SC = SchedModel.resolveSchedClass(SU->getInstr());
   if (!SC->isValid())
     return 0;
 
@@ -326,8 +431,8 @@
   // For other instructions, give a cost to the use of the critical resource.
   else if (CriticalResourceIdx != UINT_MAX) {
     for (TargetSchedModel::ProcResIter
-           PI = SchedModel->getWriteProcResBegin(SC),
-           PE = SchedModel->getWriteProcResEnd(SC); PI != PE; ++PI)
+           PI = SchedModel.getWriteProcResBegin(SC),
+           PE = SchedModel.getWriteProcResEnd(SC); PI != PE; ++PI)
       if (PI->ProcResourceIdx == CriticalResourceIdx)
         Cost = PI->Cycles;
   }
@@ -335,3 +440,89 @@
   return Cost;
 }
 
+void SystemZHazardRecognizer::
+advance(MachineBasicBlock::iterator I,
+        MachineBasicBlock::iterator NextBegin) {
+  for (; I != NextBegin; ++I) {
+    if (I->isPosition() || I->isDebugValue())
+      continue;
+    emitInstruction(&*I);
+  }
+} 
+
+// Try to find a single predecessor that would be interesting for the
+// scheduler in the top-most region of MBB.
+static MachineBasicBlock *getSingleSchedPred(MachineBasicBlock *MBB,
+                                             const MachineLoop *Loop) {
+  if (MBB->pred_size() == 1)
+    return *MBB->pred_begin();
+
+  // The loop header has two predecessors, return the latch, but not for a
+  // single block loop.
+  if (MBB->pred_size() == 2 && Loop != nullptr && Loop->getHeader() == MBB) {
+    for (auto I = MBB->pred_begin(); I != MBB->pred_end(); ++I)
+      if (Loop->contains(*I))
+        return (*I == MBB ? nullptr : *I);
+  }
+
+  return nullptr;
+}
+
+void SystemZHazardRecognizer::
+takeStateFromPred() {
+  // Take state from single predecessor MBB, if it has been scheduled.
+  MachineBasicBlock *SinglePredMBB = getSingleSchedPred(MBB, Loop);
+  assert (!SinglePredMBB || !Loop || Loop->contains(SinglePredMBB));
+  if (SinglePredMBB == nullptr ||
+      SchedStates->find(SinglePredMBB) == SchedStates->end())
+    return;
+
+  // Be optimistic and assume that branch prediction will generally do "the
+  // right thing".
+
+  // Get incoming scheduler state.
+  SystemZHazardRecognizer *incoming = (*SchedStates)[SinglePredMBB];
+  DEBUG (dbgs() << "+++ Continued scheduling from MBB#"
+         << incoming->MBB->getNumber() << "\n";);
+
+  // Current decoder group
+  CurrGroupSize = incoming->CurrGroupSize;
+  DEBUG (CurGroupDbg = incoming->CurGroupDbg;);
+
+  // Processor resources
+  ResourceCounters = incoming->ResourceCounters;
+  CriticalResourceIdx = incoming->CriticalResourceIdx;
+
+  // FPd
+  LastFPdOpCycleIdx = incoming->LastFPdOpCycleIdx;
+  GrpCount = incoming->GrpCount;
+
+  // Emit incoming terminator(s).
+  for (MachineBasicBlock::iterator I = incoming->MBB->getFirstTerminator();
+       I != incoming->MBB->end(); I++)
+    if (emitIncomingBranch(&*I))
+      break;
+}
+
+void SystemZHazardRecognizer::leaveMBB() {
+  DAG = nullptr;
+
+  MachineBasicBlock::iterator I;
+  if (LastEmittedMI != nullptr)
+    // If scheduling was done, emit everything after the region.
+    I = std::next(MachineBasicBlock::iterator(LastEmittedMI));
+  else if (LastCall != nullptr)
+    // Otherwise, emit everything after the last call in MBB, if there is
+    // one.
+    I = std::next(MachineBasicBlock::iterator(LastCall));
+  else {
+    // Get the correct final state by emitting the whole MBB.
+    takeStateFromPred();
+    I = MBB->begin();
+  }
+
+  // Advance to first terminator. The successor block will handle them in
+  // takeStateFromPred().
+  advance(I, MBB->getFirstTerminator());
+}
+
Index: lib/Target/SystemZ/SystemZMachineScheduler.h
===================================================================
--- lib/Target/SystemZ/SystemZMachineScheduler.h
+++ lib/Target/SystemZ/SystemZMachineScheduler.h
@@ -7,11 +7,12 @@
 //
 //===----------------------------------------------------------------------===//
 //
-// -------------------------- Post RA scheduling ---------------------------- //
+// -------------------------- Post RA scheduling ----------------------------
 // SystemZPostRASchedStrategy is a scheduling strategy which is plugged into
-// the MachineScheduler. It has a sorted Available set of SUs and a pickNode()
-// implementation that looks to optimize decoder grouping and balance the
-// usage of processor resources.
+// the MachineScheduler. It has a sorted Available set of SUs and a
+// pickNode() implementation that looks to optimize decoder grouping and
+// balance the usage of processor resources. Scheduler states are saved for
+// the end region of each MBB, so that a successor block can learn from it.
 //===----------------------------------------------------------------------===//
 
 #include "SystemZHazardRecognizer.h"
@@ -29,7 +30,7 @@
 /// A MachineSchedStrategy implementation for SystemZ post RA scheduling.
 class SystemZPostRASchedStrategy : public MachineSchedStrategy {
   ScheduleDAGMI *DAG;
-  
+
   /// A candidate during instruction evaluation.
   struct Candidate {
     SUnit *SU = nullptr;
@@ -79,12 +80,34 @@
   /// The set of available SUs to schedule next.
   SUSet Available;
 
-  // HazardRecognizer that tracks the scheduler state for the current
-  // region.
-  SystemZHazardRecognizer HazardRec;
-  
+  /// Maintain hazard recognizers for all blocks, so that the scheduler state
+  /// can be maintained past BB boundaries when appropriate.
+  MBB2HazRec SchedStates;
+
+  /// HazardRecognizer that tracks the scheduler state for the bottom-most
+  /// region of each MBB.
+  SystemZHazardRecognizer *HazardRec;
+
+  /// A temporary HazardRecognizer used for regions that are separated (by a
+  /// call) from the end region of the MBB.
+  SystemZHazardRecognizer TmpHazRec;
+
+  /// Since there is no virtual leaveRegion() method, use a pointer to check
+  /// when scheduler has changed MBB.
+  MachineBasicBlock *PreviouslyVisitedMBB;
+
+  /// Loops are checked so that headers can be identified in
+  /// takeStateFromPred().
+  const MachineLoopInfo *MLI;
+
 public:
   SystemZPostRASchedStrategy(const MachineSchedContext *C);
+  virtual ~SystemZPostRASchedStrategy();
+
+  /// Called for a region before scheduling.
+  void initPolicy(MachineBasicBlock::iterator Begin,
+                  MachineBasicBlock::iterator End,
+                  unsigned NumRegionInstrs) override;
 
   /// PostRA scheduling does not track pressure.
   bool shouldTrackPressure() const override { return false; }
Index: lib/Target/SystemZ/SystemZMachineScheduler.cpp
===================================================================
--- lib/Target/SystemZ/SystemZMachineScheduler.cpp
+++ lib/Target/SystemZ/SystemZMachineScheduler.cpp
@@ -7,18 +7,19 @@
 //
 //===----------------------------------------------------------------------===//
 //
-// -------------------------- Post RA scheduling ---------------------------- //
+// -------------------------- Post RA scheduling ----------------------------
 // SystemZPostRASchedStrategy is a scheduling strategy which is plugged into
-// the MachineScheduler. It has a sorted Available set of SUs and a pickNode()
-// implementation that looks to optimize decoder grouping and balance the
-// usage of processor resources.
-//===----------------------------------------------------------------------===//
+// the MachineScheduler. It has a sorted Available set of SUs and a
+// pickNode() implementation that looks to optimize decoder grouping and
+// balance the usage of processor resources. Scheduler states are saved for
+// the end region of each MBB, so that a successor block can learn from it.
+// ===----------------------------------------------------------------------===//
 
 #include "SystemZMachineScheduler.h"
 
 using namespace llvm;
 
-#define DEBUG_TYPE "misched"
+#define DEBUG_TYPE "machine-scheduler"
 
 #ifndef NDEBUG
 // Print the set of SUs
@@ -36,12 +37,58 @@
 
 SystemZPostRASchedStrategy::
 SystemZPostRASchedStrategy(const MachineSchedContext *C)
-  : DAG(nullptr), HazardRec(C) {}
+  : DAG(nullptr), HazardRec(nullptr),
+    TmpHazRec(&C->MF->getSubtarget(), &SchedStates),
+    PreviouslyVisitedMBB(nullptr), MLI(C->MLI) {}
+
+SystemZPostRASchedStrategy::~SystemZPostRASchedStrategy() {
+  // The per-MBB hazard recognizers stored in SchedStates were allocated
+  // with new in initPolicy() and are owned by this strategy, so release
+  // each of them here.
+  for (const auto &StateEntry : SchedStates)
+    delete StateEntry.second;
+}
+
+void SystemZPostRASchedStrategy::initPolicy(MachineBasicBlock::iterator Begin,
+                                            MachineBasicBlock::iterator End,
+                                            unsigned NumRegionInstrs) {
+  MachineBasicBlock *MBB = Begin->getParent();
+  const MachineLoop *Loop = MLI->getLoopFor(MBB);
+
+  // If a new MBB has been entered, finalize the previous MBB.
+  if (PreviouslyVisitedMBB != nullptr && PreviouslyVisitedMBB != MBB)
+    SchedStates.find(PreviouslyVisitedMBB)->second->leaveMBB();
+  PreviouslyVisitedMBB = MBB;
+
+  // We can maintain the scheduler state perfectly even when scheduling regions
+  // in reverse order (bottom-up) in MBB, because the only scheduling
+  // boundaries we have are calls, which simply reset the state.
+  if (SchedStates.find(MBB) != SchedStates.end()) {
+    // Use the temporary HazardRecognizer for any regions above calls.
+    DEBUG (dbgs() << "+++ Continuing in MBB#" << MBB->getNumber());
+    HazardRec = &TmpHazRec;
+  } else {
+    // First time in MBB is the bottom-most region. Make a new
+    // HazardRecognizer and save it for use by successor block.
+    DEBUG (dbgs() << "+++ Entering MBB#" << MBB->getNumber());
+    HazardRec = new SystemZHazardRecognizer(&MBB->getParent()->getSubtarget(),
+                                            &SchedStates);
+    SchedStates[MBB] = HazardRec;
+  }
+
+  DEBUG (if(Loop && Loop->getHeader() == MBB)
+           dbgs() << " (Loop header)";
+         dbgs() << ":\n";);
+
+  if (End != MBB->end() && End->isCall())
+    SchedStates[MBB]->setLastCall(End);
+
+  HazardRec->enterRegion(MBB, Loop, Begin);
+}
 
 void SystemZPostRASchedStrategy::initialize(ScheduleDAGMI *dag) {
   DAG = dag;
-  HazardRec.setDAG(dag);
-  HazardRec.Reset();
+  HazardRec->initialize(dag);
 }
 
 // Pick the next node to schedule.
@@ -55,31 +102,32 @@
   // If only one choice, return it.
   if (Available.size() == 1) {
     DEBUG (dbgs() << "+++ Only one: ";
-           HazardRec.dumpSU(*Available.begin(), dbgs()); dbgs() << "\n";);
+           HazardRec->dumpSU(*Available.begin(), dbgs()); dbgs() << "\n";);
     return *Available.begin();
   }
 
   // All nodes that are possible to schedule are stored by in the
   // Available set.
-  DEBUG(dbgs() << "+++ Available: "; Available.dump(HazardRec););
+  DEBUG (dbgs() << "+++ Available: "; Available.dump(*HazardRec););
 
   Candidate Best;
   for (auto *SU : Available) {
 
     // SU is the next candidate to be compared against current Best.
-    Candidate c(SU, HazardRec);
+    Candidate c(SU, *HazardRec);
 
     // Remeber which SU is the best candidate.
     if (Best.SU == nullptr || c < Best) {
       Best = c;
-      DEBUG(dbgs() << "+++ Best sofar: ";
-            HazardRec.dumpSU(Best.SU, dbgs());
-            if (Best.GroupingCost != 0)
-              dbgs() << "\tGrouping cost:" << Best.GroupingCost;
-            if (Best.ResourcesCost != 0)
-              dbgs() << " Resource cost:" << Best.ResourcesCost;
-            dbgs() << " Height:" << Best.SU->getHeight();
-            dbgs() << "\n";);
+      // Trace the new best candidate; every non-zero cost is labelled.
+      DEBUG (dbgs() << "+++ Best sofar: ";
+             HazardRec->dumpSU(Best.SU, dbgs());
+             if (Best.GroupingCost != 0)
+               dbgs() << "\tGrouping cost:" << Best.GroupingCost;
+             if (Best.ResourcesCost != 0)
+               dbgs() << " Resource cost:" << Best.ResourcesCost;
+             dbgs() << " Height:" << Best.SU->getHeight();
+             dbgs() << "\n";);
     }
 
     // Once we know we have seen all SUs that affect grouping or use unbuffered
@@ -134,11 +182,11 @@
 }
 
 void SystemZPostRASchedStrategy::schedNode(SUnit *SU, bool IsTopNode) {
-  DEBUG(dbgs() << "+++ Scheduling SU(" << SU->NodeNum << ")\n";);
+  DEBUG (dbgs() << "+++ Scheduling SU(" << SU->NodeNum << ")\n";);
 
   // Remove SU from Available set and update HazardRec.
   Available.erase(SU);
-  HazardRec.EmitInstruction(SU);
+  HazardRec->EmitInstruction(SU);
 }
 
 void SystemZPostRASchedStrategy::releaseTopNode(SUnit *SU) {
Index: test/CodeGen/SystemZ/int-cmp-48.ll
===================================================================
--- test/CodeGen/SystemZ/int-cmp-48.ll
+++ test/CodeGen/SystemZ/int-cmp-48.ll
@@ -29,8 +29,8 @@
 define void @f2(i8 *%src) {
 ; CHECK-LABEL: f2:
 ; CHECK: llc [[REG:%r[0-5]]], 0(%r2)
-; CHECK: tmll [[REG]], 1
-; CHECK: mvi 0(%r2), 0
+; CHECK-DAG: mvi 0(%r2), 0
+; CHECK-DAG: tmll [[REG]], 1
 ; CHECK: ber %r14
 ; CHECK: br %r14
 entry: