Index: lib/CodeGen/MachinePipeliner.cpp
===================================================================
--- lib/CodeGen/MachinePipeliner.cpp
+++ lib/CodeGen/MachinePipeliner.cpp
@@ -125,6 +125,7 @@
 
 STATISTIC(NumTrytoPipeline, "Number of loops that we attempt to pipeline");
 STATISTIC(NumPipelined, "Number of loops software pipelined");
+STATISTIC(NumNodeOrderIssues, "Number of node order issues found");
 
 /// A command line option to turn software pipelining on or off.
 static cl::opt<bool> EnableSWP("enable-pipeliner", cl::Hidden, cl::init(true),
@@ -241,6 +242,8 @@
   struct NodeInfo {
     int ASAP = 0;
     int ALAP = 0;
+    int ZeroLatencyDepth = 0;
+    int ZeroLatencyHeight = 0;
 
     NodeInfo() = default;
   };
@@ -320,9 +323,21 @@
   /// The depth, in the dependence graph, for a node.
   int getDepth(SUnit *Node) { return Node->getDepth(); }
 
+  /// The maximum unweighted length of a path from an arbitrary node to the
+  /// given node in which each edge has latency 0
+  int getZeroLatencyDepth(SUnit *Node) {
+    return ScheduleInfo[Node->NodeNum].ZeroLatencyDepth;
+  }
+
   /// The height, in the dependence graph, for a node.
   int getHeight(SUnit *Node) { return Node->getHeight(); }
 
+  /// The maximum unweighted length of a path from the given node to an
+  /// arbitrary node in which each edge has latency 0
+  int getZeroLatencyHeight(SUnit *Node) {
+    return ScheduleInfo[Node->NodeNum].ZeroLatencyHeight;
+  }
+
   /// Return true if the dependence is a back-edge in the data dependence graph.
   /// Since the DAG doesn't contain cycles, we represent a cycle in the graph
   /// using an anti dependence from a Phi to an instruction.
@@ -404,6 +419,7 @@
   void addConnectedNodes(SUnit *SU, NodeSet &NewSet,
                          SetVector<SUnit *> &NodesAdded);
   void computeNodeOrder(NodeSetType &NodeSets);
+  bool isValidNodeOrder(const NodeSetType &Circuits) const;
   bool schedulePipeline(SMSchedule &Schedule);
   void generatePipelinedLoop(SMSchedule &Schedule);
   void generateProlog(SMSchedule &Schedule, unsigned LastStage,
@@ -863,6 +879,7 @@
 
   NodeSetType NodeSets;
   findCircuits(NodeSets);
+  NodeSetType Circuits = NodeSets;
 
   // Calculate the MII.
   unsigned ResMII = calculateResMII();
@@ -916,6 +933,18 @@
 
   computeNodeOrder(NodeSets);
 
+  // check for node order issues
+  bool ValidNodeOrder = isValidNodeOrder(Circuits);
+
+  // Note that we do not use an assert to check for invalid node orders.
+  // The reason is that although an invalid node oder may prevent
+  // the pipeliner from finding a pipelined schedule for arbitrary II,
+  // it does not lead to the generation of incorrect code.
+  DEBUG({
+    if (!ValidNodeOrder)
+      dbgs() << "Invalid node order found!\n";
+  });
+
   SMSchedule Schedule(Pass.MF);
   Scheduled = schedulePipeline(Schedule);
 
@@ -1568,42 +1597,52 @@
   });
 
   int maxASAP = 0;
-  // Compute ASAP.
+  // Compute ASAP and ZeroLatencyDepth.
   for (ScheduleDAGTopologicalSort::const_iterator I = Topo.begin(),
                                                   E = Topo.end();
        I != E; ++I) {
     int asap = 0;
+    int zeroLatencyDepth = 0;
     SUnit *SU = &SUnits[*I];
     for (SUnit::const_pred_iterator IP = SU->Preds.begin(),
                                     EP = SU->Preds.end();
          IP != EP; ++IP) {
+      SUnit *pred = IP->getSUnit();
+      if (getLatency(SU, *IP) == 0)
+        zeroLatencyDepth =
+            std::max(zeroLatencyDepth, getZeroLatencyDepth(pred) + 1);
       if (ignoreDependence(*IP, true))
         continue;
-      SUnit *pred = IP->getSUnit();
       asap = std::max(asap, (int)(getASAP(pred) + getLatency(SU, *IP) -
                                   getDistance(pred, SU, *IP) * MII));
     }
     maxASAP = std::max(maxASAP, asap);
     ScheduleInfo[*I].ASAP = asap;
+    ScheduleInfo[*I].ZeroLatencyDepth = zeroLatencyDepth;
   }
 
-  // Compute ALAP and MOV.
+  // Compute ALAP, ZeroLatencyHeight, and MOV.
   for (ScheduleDAGTopologicalSort::const_reverse_iterator I = Topo.rbegin(),
                                                           E = Topo.rend();
        I != E; ++I) {
     int alap = maxASAP;
+    int zeroLatencyHeight = 0;
     SUnit *SU = &SUnits[*I];
     for (SUnit::const_succ_iterator IS = SU->Succs.begin(),
                                     ES = SU->Succs.end();
          IS != ES; ++IS) {
+      SUnit *succ = IS->getSUnit();
+      if (getLatency(SU, *IS) == 0)
+        zeroLatencyHeight =
+            std::max(zeroLatencyHeight, getZeroLatencyHeight(succ) + 1);
       if (ignoreDependence(*IS, true))
         continue;
-      SUnit *succ = IS->getSUnit();
       alap = std::min(alap, (int)(getALAP(succ) - getLatency(SU, *IS) +
                                   getDistance(SU, succ, *IS) * MII));
     }
 
     ScheduleInfo[*I].ALAP = alap;
+    ScheduleInfo[*I].ZeroLatencyHeight = zeroLatencyHeight;
   }
 
   // After computing the node functions, compute the summary for each node set.
@@ -1618,6 +1657,8 @@
       dbgs() << "\t   MOV  = " << getMOV(&SUnits[i]) << "\n";
       dbgs() << "\t   D    = " << getDepth(&SUnits[i]) << "\n";
       dbgs() << "\t   H    = " << getHeight(&SUnits[i]) << "\n";
+      dbgs() << "\t   ZLD  = " << getZeroLatencyDepth(&SUnits[i]) << "\n";
+      dbgs() << "\t   ZLH  = " << getZeroLatencyHeight(&SUnits[i]) << "\n";
     }
   });
 }
@@ -1986,14 +2027,6 @@
     }
 }
 
-/// Return true if Inst1 defines a value that is used in Inst2.
-static bool hasDataDependence(SUnit *Inst1, SUnit *Inst2) {
-  for (auto &SI : Inst1->Succs)
-    if (SI.getSUnit() == Inst2 && SI.getKind() == SDep::Data)
-      return true;
-  return false;
-}
-
 /// Compute an ordered list of the dependence graph nodes, which
 /// indicates the order that the nodes will be scheduled.  This is a
 /// two-level algorithm. First, a partial order is created, which
@@ -2040,18 +2073,20 @@
     while (!R.empty()) {
       if (Order == TopDown) {
         // Choose the node with the maximum height.  If more than one, choose
-        // the node with the lowest MOV. If still more than one, check if there
-        // is a dependence between the instructions.
+        // the node with the maximum ZeroLatencyHeight. If still more than one,
+        // choose the node with the lowest MOV.
         while (!R.empty()) {
           SUnit *maxHeight = nullptr;
           for (SUnit *I : R) {
             if (maxHeight == nullptr || getHeight(I) > getHeight(maxHeight))
               maxHeight = I;
             else if (getHeight(I) == getHeight(maxHeight) &&
-                     getMOV(I) < getMOV(maxHeight) &&
-                     !hasDataDependence(maxHeight, I))
+                     getZeroLatencyHeight(I) > getZeroLatencyHeight(maxHeight))
               maxHeight = I;
-            else if (hasDataDependence(I, maxHeight))
+            else if (getHeight(I) == getHeight(maxHeight) &&
+                     getZeroLatencyHeight(I) ==
+                         getZeroLatencyHeight(maxHeight) &&
+                     getMOV(I) < getMOV(maxHeight))
               maxHeight = I;
           }
           NodeOrder.insert(maxHeight);
@@ -2084,18 +2119,19 @@
           R.insert(N.begin(), N.end());
       } else {
         // Choose the node with the maximum depth.  If more than one, choose
-        // the node with the lowest MOV. If there is still more than one, check
-        // for a dependence between the instructions.
+        // the node with the maximum ZeroLatencyDepth. If still more than one,
+        // choose the node with the lowest MOV.
         while (!R.empty()) {
           SUnit *maxDepth = nullptr;
           for (SUnit *I : R) {
             if (maxDepth == nullptr || getDepth(I) > getDepth(maxDepth))
               maxDepth = I;
             else if (getDepth(I) == getDepth(maxDepth) &&
-                     getMOV(I) < getMOV(maxDepth) &&
-                     !hasDataDependence(I, maxDepth))
+                     getZeroLatencyDepth(I) > getZeroLatencyDepth(maxDepth))
               maxDepth = I;
-            else if (hasDataDependence(maxDepth, I))
+            else if (getDepth(I) == getDepth(maxDepth) &&
+                     getZeroLatencyDepth(I) == getZeroLatencyDepth(maxDepth) &&
+                     getMOV(I) < getMOV(maxDepth))
               maxDepth = I;
           }
           NodeOrder.insert(maxDepth);
@@ -3864,6 +3900,88 @@
   return true;
 }
 
+/// A property of the node order in swing-modulo-scheduling is
+/// that for nodes outside circuits the following holds:
+/// none of them is scheduled after both a successor and a
+/// predecessor.
+/// The method below checks whether the property is met.
+bool SwingSchedulerDAG::isValidNodeOrder(const NodeSetType &Circuits) const {
+
+  // a sorted vector that maps each SUnit to its index in the NodeOrder
+  typedef std::pair<SUnit *, unsigned> UnitIndex;
+  std::vector<UnitIndex> Indices(NodeOrder.size(), std::make_pair(nullptr, 0));
+
+  for (unsigned i = 0, s = NodeOrder.size(); i < s; ++i)
+    Indices.push_back(std::make_pair(NodeOrder[i], i));
+
+  auto CompareKey = [](UnitIndex i1, UnitIndex i2) {
+    return std::get<0>(i1) < std::get<0>(i2);
+  };
+
+  // sort, so that we can perform a binary search
+  std::sort(Indices.begin(), Indices.end(), CompareKey);
+
+  bool Valid = true;
+  // for each SUnit in the NodeOrder, check whether
+  // it appears after both a successor and a predecessor
+  // of the SUnit. If this is the case, and the SUnit
+  // is not part of circuit, then the NodeOrder is not
+  // valid.
+  for (unsigned i = 0, s = NodeOrder.size(); i < s; ++i) {
+    SUnit *SU = NodeOrder[i];
+    unsigned Index = i;
+
+    bool PredBefore = false;
+    bool SuccBefore = false;
+
+    SUnit *Succ;
+    SUnit *Pred;
+
+    for (SDep &PredEdge : SU->Preds) {
+      SUnit *PredSU = PredEdge.getSUnit();
+      unsigned PredIndex =
+          std::get<1>(*std::lower_bound(Indices.begin(), Indices.end(),
+                                        std::make_pair(PredSU, 0), CompareKey));
+      if (!PredSU->getInstr()->isPHI() && PredIndex < Index) {
+        PredBefore = true;
+        Pred = PredSU;
+        break;
+      }
+    }
+
+    for (SDep &SuccEdge : SU->Succs) {
+      SUnit *SuccSU = SuccEdge.getSUnit();
+      unsigned SuccIndex =
+          std::get<1>(*std::lower_bound(Indices.begin(), Indices.end(),
+                                        std::make_pair(SuccSU, 0), CompareKey));
+      if (!SuccSU->getInstr()->isPHI() && SuccIndex < Index) {
+        SuccBefore = true;
+        Succ = SuccSU;
+        break;
+      }
+    }
+
+    if (PredBefore && SuccBefore && !SU->getInstr()->isPHI()) {
+      // instructions in circuits are allowed to be scheduled
+      // after both a successor and predecessor.
+      bool InCircuit = std::any_of(
+          Circuits.begin(), Circuits.end(),
+          [SU](const NodeSet &Circuit) { return Circuit.count(SU); });
+      if (InCircuit)
+        DEBUG(dbgs() << "In a circuit, predecessor ";);
+      else {
+        Valid = false;
+        NumNodeOrderIssues++;
+        DEBUG(dbgs() << "Predecessor ";);
+      }
+      DEBUG(dbgs() << Pred->NodeNum << " and successor " << Succ->NodeNum
+                   << " are scheduled before node " << SU->NodeNum << "\n";);
+    }
+  }
+
+  return Valid;
+}
+
 /// Attempt to fix the degenerate cases when the instruction serialization
 /// causes the register lifetimes to overlap. For example,
 ///   p' = store_pi(p, b)
Index: test/CodeGen/Hexagon/SUnit-boundary-prob.ll
===================================================================
--- test/CodeGen/Hexagon/SUnit-boundary-prob.ll
+++ test/CodeGen/Hexagon/SUnit-boundary-prob.ll
@@ -1,8 +1,12 @@
-; RUN: llc -march=hexagon -O2 -mcpu=hexagonv60  < %s | FileCheck %s
+; REQUIRES: asserts
+; RUN: llc -march=hexagon -O2 -mcpu=hexagonv60 --stats -o - 2>&1  < %s | FileCheck %s
 ; This was aborting while processing SUnits.
 
 ; CHECK: vmem
 
+; CHECK-NOT: Number of node order issues found
+; CHECK: Number of loops software pipelined
+; CHECK-NOT: Number of node order issues found
 source_filename = "bugpoint-output-bdb0052.bc"
 target datalayout = "e-m:e-p:32:32:32-a:0-n16:32-i64:64:64-i32:32:32-i16:16:16-i1:8:8-f32:32:32-f64:64:64-v32:32:32-v64:64:64-v512:512:512-v1024:1024:1024-v2048:2048:2048"
 target triple = "hexagon-unknown--elf"
Index: test/CodeGen/Hexagon/frame-offset-overflow.ll
===================================================================
--- test/CodeGen/Hexagon/frame-offset-overflow.ll
+++ test/CodeGen/Hexagon/frame-offset-overflow.ll
@@ -1,9 +1,14 @@
-; RUN: llc -march=hexagon < %s | FileCheck %s
+; REQUIRES: asserts
+; RUN: llc -march=hexagon --stats -o - 2>&1 < %s | FileCheck %s
 
-; In reality, check that the compilation succeeded and that some code was
-; generated.
+; Check that the compilation succeeded and that some code was generated.
 ; CHECK: vadd
 
+; Check that the loop is pipelined and that a valid node order is used.
+; CHECK-NOT: Number of node order issues found
+; CHECK: Number of loops software pipelined
+; CHECK-NOT: Number of node order issues found
+
 target triple = "hexagon"
 
 define void @fred(i16* noalias nocapture readonly %p0, i32 %p1, i32 %p2, i16* noalias nocapture %p3, i32 %p4) local_unnamed_addr #1 {
Index: test/CodeGen/Hexagon/swp-vmult.ll
===================================================================
--- test/CodeGen/Hexagon/swp-vmult.ll
+++ test/CodeGen/Hexagon/swp-vmult.ll
@@ -4,8 +4,8 @@
 ; Multiply and accumulate
 ; CHECK: mpyi([[REG0:r([0-9]+)]],[[REG1:r([0-9]+)]])
 ; CHECK-NEXT: add(r{{[0-9]+}},#4)
-; CHECK-NEXT: [[REG0]] = memw(r{{[0-9]+}}+r{{[0-9]+}}<<#0)
-; CHECK-NEXT: [[REG1]] = memw(r{{[0-9]+}}+r{{[0-9]+}}<<#0)
+; CHECK-DAG: [[REG1]] = memw(r{{[0-9]+}}+r{{[0-9]+}}<<#0)
+; CHECK-DAG: [[REG0]] = memw(r{{[0-9]+}}+r{{[0-9]+}}<<#0)
 ; CHECK-NEXT: endloop0
 
 define i32 @foo(i32* %a, i32* %b, i32 %n) {
Index: test/CodeGen/Hexagon/vect/vect-shuffle.ll
===================================================================
--- test/CodeGen/Hexagon/vect/vect-shuffle.ll
+++ test/CodeGen/Hexagon/vect/vect-shuffle.ll
@@ -1,8 +1,12 @@
-; RUN: llc -march=hexagon -mcpu=hexagonv5 -disable-hsdr < %s | FileCheck %s
+; REQUIRES: asserts
+; RUN: llc -march=hexagon -mcpu=hexagonv5 -disable-hsdr --stats -o - 2>&1 < %s | FileCheck %s
 
 ; Check that store is post-incremented.
 ; CHECK-NOT: extractu(r{{[0-9]+}},#32,
 ; CHECK-NOT: insert
+; CHECK-NOT: Number of node order issues found
+; CHECK: Number of loops software pipelined
+; CHECK-NOT: Number of node order issues found
 target datalayout = "e-p:32:32:32-i64:64:64-i32:32:32-i16:16:16-i1:32:32-f64:64:64-f32:32:32-v64:64:64-v32:32:32-a0:0-n16:32"
 target triple = "hexagon"