Index: polly/trunk/include/polly/ScopInfo.h
===================================================================
--- polly/trunk/include/polly/ScopInfo.h
+++ polly/trunk/include/polly/ScopInfo.h
@@ -1474,17 +1474,62 @@
   /// the dimensionality of the underlying ScopArrayInfo object.
   void updateAccessDimensionality();
 
-  /// @brief Build Schedule for the SCoP region.
-  ///
+  /// @brief Construct the schedule of this SCoP.
   void buildSchedule();
 
-  /// @brief Build Schedule for the region @p RN.
-  ///
-  /// @param RN             The current region traversed.
-  /// @param LoopSchedules  Map from loops to their schedule and progress.
-  void buildSchedule(
-      RegionNode *RN,
-      DenseMap<Loop *, std::pair<isl_schedule *, unsigned>> &LoopSchedules);
+  /// @brief A loop stack element to keep track of per-loop information during
+  ///        schedule construction.
+  typedef struct LoopStackElement {
+    // The loop for which we keep information.
+    Loop *L;
+
+    // The (possibly incomplete) schedule for this loop.
+    isl_schedule *Schedule;
+
+    // The number of basic blocks in the current loop for which a schedule has
+    // already been constructed.
+    unsigned NumBlocksProcessed;
+
+    LoopStackElement(Loop *L, __isl_give isl_schedule *S,
+                     unsigned NumBlocksProcessed)
+        : L(L), Schedule(S), NumBlocksProcessed(NumBlocksProcessed) {}
+  } LoopStackElementTy;
+
+  /// @brief The loop stack used for schedule construction.
+  ///
+  /// The loop stack keeps track of schedule information for a set of nested
+  /// loops as well as an (optional) 'nullptr' loop that models the outermost
+  /// schedule dimension. The loops in a loop stack always have a parent-child
+  /// relation where the loop at position n is the parent of the loop at
+  /// position n + 1.
+  typedef SmallVector<LoopStackElementTy, 4> LoopStackTy;
+
+  /// @brief Construct schedule information for a given Region and add the
+  ///        derived information to @p LoopStack.
+  ///
+  /// Given a Region, we derive schedule information for all RegionNodes
+  /// contained in this region, ensuring that the assigned execution times
+  /// correctly model the existing control flow relations.
+  ///
+  /// @param R              The region to process.
+  /// @param LoopStack      A stack of loops that are currently under
+  ///                       construction.
+  void buildSchedule(Region *R, LoopStackTy &LoopStack);
+
+  /// @brief Build Schedule for the region node @p RN and add the derived
+  ///        information to @p LoopStack.
+  ///
+  /// In case @p RN is a BasicBlock or a non-affine Region, we construct the
+  /// schedule for this @p RN and also finalize the schedules of any loops
+  /// that the current @p RN completes.
+  ///
+  /// In case @p RN is a regular sub-region (i.e., not a non-affine one), we
+  /// delegate the construction to buildSchedule(Region *R, ...).
+  ///
+  /// @param RN             The region node to be processed.
+  /// @param LoopStack      A stack of loops that are currently under
+  ///                       construction.
+  void buildSchedule(RegionNode *RN, LoopStackTy &LoopStack);
 
   /// @brief Collect all memory access relations of a given type.
   ///
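
The LoopStack bookkeeping declared above is easiest to see on a toy model. The
following standalone sketch is not Polly code: the ToyLoop and StackElem types
are hypothetical and plain strings stand in for isl schedules, but it mimics
how, once the last block of a loop has been processed, the loop's schedule is
folded into its parent element and popped off the stack.

// Minimal sketch of the loop-stack folding idea (hypothetical types, not
// Polly's). Parentheses mark where a schedule dimension would be inserted
// for a completed loop.
#include <cassert>
#include <string>
#include <vector>

struct ToyLoop {
  unsigned NumBlocks; // Blocks in this loop, including nested loops.
};

struct StackElem {
  ToyLoop *L;           // nullptr models the outermost "no loop" level.
  std::string Schedule; // Stand-in for the partially built isl_schedule.
  unsigned NumBlocksProcessed;
};

// Process one block belonging to the loop on top of the stack, then fold
// every loop that is now complete into its parent element.
static void processBlock(std::vector<StackElem> &Stack, const std::string &S) {
  StackElem *Top = &Stack.back();
  Top->NumBlocksProcessed += 1;
  Top->Schedule += S;

  while (Top->L && Top->NumBlocksProcessed == Top->L->NumBlocks) {
    std::string Done = "(" + Top->Schedule + ")"; // "insert" a dimension
    unsigned Blocks = Top->NumBlocksProcessed;
    Stack.pop_back();
    Top = &Stack.back();
    Top->Schedule += Done;
    Top->NumBlocksProcessed += Blocks;
  }
}

int main() {
  ToyLoop Outer{3}; // header + one inner-loop block + latch
  ToyLoop Inner{1};

  std::vector<StackElem> Stack{{nullptr, "", 0}};
  Stack.push_back({&Outer, "", 0});
  processBlock(Stack, "H"); // outer header
  Stack.push_back({&Inner, "", 0});
  processBlock(Stack, "B"); // inner body: completes Inner, folds into Outer
  processBlock(Stack, "L"); // outer latch: completes Outer, folds to the top
  assert(Stack.size() == 1 && Stack.back().Schedule == "(H(B)L)");
  return 0;
}
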
Index: polly/trunk/lib/Analysis/ScopInfo.cpp
===================================================================
--- polly/trunk/lib/Analysis/ScopInfo.cpp
+++ polly/trunk/lib/Analysis/ScopInfo.cpp
@@ -3444,61 +3444,129 @@
 }
 
 void Scop::buildSchedule() {
-  DenseMap<Loop *, std::pair<isl_schedule *, unsigned>> LoopSchedules;
   Loop *L = getLoopSurroundingRegion(getRegion(), LI);
-  LoopSchedules[L];
-  buildSchedule(getRegion().getNode(), LoopSchedules);
-  Schedule = LoopSchedules[L].first;
+  LoopStackTy LoopStack({LoopStackElementTy(L, nullptr, 0)});
+  buildSchedule(getRegion().getNode(), LoopStack);
+  assert(LoopStack.size() == 1 && LoopStack.back().L == L);
+  Schedule = LoopStack[0].Schedule;
 }
 
-void Scop::buildSchedule(
-    RegionNode *RN,
-    DenseMap<Loop *, std::pair<isl_schedule *, unsigned>> &LoopSchedules) {
+/// To generate a schedule for the elements in a Region we traverse the Region
+/// in reverse-post-order and add the contained RegionNodes in traversal order
+/// to the schedule of the loop that is currently at the top of the LoopStack.
+/// For loop-free code, this results in a correct sequential ordering.
+///
+/// Example:
+///           bb1(0)
+///         /     \.
+///      bb2(1)   bb3(2)
+///         \    /  \.
+///          bb4(3)  bb5(4)
+///             \   /
+///              bb6(5)
+///
+/// Including loops requires additional processing. Whenever a loop header is
+/// encountered, the corresponding loop is added to the @p LoopStack. Starting
+/// from an empty schedule, we first process all RegionNodes that are within
+/// this loop and complete the sequential schedule at this loop level before
+/// processing any other nodes. To implement this loop-nodes-first
+/// processing, a plain reverse post-order traversal is insufficient. Hence,
+/// we additionally check if the traversal yields sub-regions or blocks that
+/// are outside the last loop on the @p LoopStack. These region nodes are
+/// then queued and only traversed after all nodes within the current loop
+/// have been processed.
+void Scop::buildSchedule(Region *R, LoopStackTy &LoopStack) {
+  Loop *OuterScopLoop = getLoopSurroundingRegion(getRegion(), LI);
+
+  ReversePostOrderTraversal<Region *> RTraversal(R);
+  std::deque<RegionNode *> WorkList(RTraversal.begin(), RTraversal.end());
+  std::deque<RegionNode *> DelayList;
+  bool LastRNWaiting = false;
+
+  // Iterate over the region @p R in reverse post-order, but queue
+  // sub-regions/blocks iff they are not part of the last encountered, but
+  // not yet completely traversed, loop. The variable LastRNWaiting indicates
+  // that we queued the last sub-region/block taken from the reverse
+  // post-order iterator. If it is set, we have to explore the next
+  // sub-region/block from the iterator (if any) to guarantee progress. If it
+  // is not set, we first try the next queued sub-region/block.
+  while (!WorkList.empty() || !DelayList.empty()) {
+    RegionNode *RN;
+
+    if ((LastRNWaiting && !WorkList.empty()) || DelayList.size() == 0) {
+      RN = WorkList.front();
+      WorkList.pop_front();
+      LastRNWaiting = false;
+    } else {
+      RN = DelayList.front();
+      DelayList.pop_front();
+    }
+
+    Loop *L = getRegionNodeLoop(RN, LI);
+    if (!getRegion().contains(L))
+      L = OuterScopLoop;
+
+    Loop *LastLoop = LoopStack.back().L;
+    if (LastLoop != L) {
+      if (!LastLoop->contains(L)) {
+        LastRNWaiting = true;
+        DelayList.push_back(RN);
+        continue;
+      }
+      LoopStack.push_back({L, nullptr, 0});
+    }
+    buildSchedule(RN, LoopStack);
+  }
+
+  return;
+}
+
+void Scop::buildSchedule(RegionNode *RN, LoopStackTy &LoopStack) {
 
   if (RN->isSubRegion()) {
     auto *LocalRegion = RN->getNodeAs<Region>();
     if (!SD.isNonAffineSubRegion(LocalRegion, &getRegion())) {
-      ReversePostOrderTraversal<Region *> RTraversal(LocalRegion);
-      for (auto *Child : RTraversal)
-        buildSchedule(Child, LoopSchedules);
+      buildSchedule(LocalRegion, LoopStack);
       return;
     }
   }
 
-  Loop *L = getRegionNodeLoop(RN, LI);
-  if (!getRegion().contains(L))
-    L = getLoopSurroundingRegion(getRegion(), LI);
-
-  int LD = getRelativeLoopDepth(L);
-  auto &LSchedulePair = LoopSchedules[L];
-  LSchedulePair.second += getNumBlocksInRegionNode(RN);
+  auto *LoopData = &LoopStack.back();
+  LoopData->NumBlocksProcessed += getNumBlocksInRegionNode(RN);
 
   if (auto *Stmt = getStmtForRegionNode(RN)) {
     auto *UDomain = isl_union_set_from_set(Stmt->getDomain());
     auto *StmtSchedule = isl_schedule_from_domain(UDomain);
-    LSchedulePair.first = combineInSequence(LSchedulePair.first, StmtSchedule);
+    LoopData->Schedule = combineInSequence(LoopData->Schedule, StmtSchedule);
   }
 
-  isl_schedule *LSchedule = LSchedulePair.first;
-  unsigned NumVisited = LSchedulePair.second;
-  while (L && NumVisited == L->getNumBlocks()) {
-    auto *PL = L->getParentLoop();
-
-    auto &PSchedulePair = LoopSchedules[PL];
-
-    if (LSchedule) {
-      auto *LDomain = isl_schedule_get_domain(LSchedule);
-      auto *MUPA = mapToDimension(LDomain, LD + 1);
-      LSchedule = isl_schedule_insert_partial_schedule(LSchedule, MUPA);
-      PSchedulePair.first = combineInSequence(PSchedulePair.first, LSchedule);
+  // Check if we just processed the last node in this loop. If we did, finalize
+  // the loop by:
+  //
+  //   - adding new schedule dimensions
+  //   - folding the resulting schedule into the parent loop schedule
+  //   - dropping the loop schedule from the LoopStack.
+  //
+  // Then continue to check surrounding loops, which might also have been
+  // completed by this node.
+  while (LoopData->L &&
+         LoopData->NumBlocksProcessed == LoopData->L->getNumBlocks()) {
+    auto Schedule = LoopData->Schedule;
+    auto NumBlocksProcessed = LoopData->NumBlocksProcessed;
+
+    LoopStack.pop_back();
+    auto &NextLoopData = LoopStack.back();
+
+    if (Schedule) {
+      auto *Domain = isl_schedule_get_domain(Schedule);
+      auto *MUPA = mapToDimension(Domain, LoopStack.size());
+      Schedule = isl_schedule_insert_partial_schedule(Schedule, MUPA);
+      NextLoopData.Schedule =
+          combineInSequence(NextLoopData.Schedule, Schedule);
     }
 
-    PSchedulePair.second += NumVisited;
-
-    L = PL;
-    LD--;
-    NumVisited = PSchedulePair.second;
-    LSchedule = PSchedulePair.first;
+    NextLoopData.NumBlocksProcessed += NumBlocksProcessed;
+    LoopData = &NextLoopData;
   }
 }
 
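
The worklist logic in buildSchedule(Region *, ...) above is compact, so here is
a standalone sketch of the same delay-queue pattern on toy data. It is not
Polly code: nodes carry a hypothetical numeric LoopId instead of consulting
LoopInfo, and a ClosesLoop flag replaces the block-count bookkeeping, but the
WorkList/DelayList/LastRNWaiting interplay mirrors the function above.

// Minimal sketch of the worklist/delay-queue pattern (toy node type, not
// Polly's RegionNode). Nodes arrive in a fixed traversal order; a node that
// does not belong to the loop currently under construction is delayed until
// that loop has been fully processed.
#include <deque>
#include <iostream>
#include <string>
#include <vector>

struct ToyNode {
  std::string Name;
  int LoopId;      // Loop this node belongs to; 0 means "outside any loop".
  bool ClosesLoop; // True if this is the last node of its loop.
};

int main() {
  // Order as a plain reverse post-order traversal might yield it: the node
  // "after" follows the loop exit and interleaves with the loop's nodes.
  std::vector<ToyNode> Traversal = {{"header", 1, false},
                                    {"after", 0, false},
                                    {"body", 1, false},
                                    {"latch", 1, true}};

  std::deque<ToyNode> WorkList(Traversal.begin(), Traversal.end());
  std::deque<ToyNode> DelayList;
  bool LastRNWaiting = false;
  int CurrentLoop = 0;

  while (!WorkList.empty() || !DelayList.empty()) {
    ToyNode N;
    if ((LastRNWaiting && !WorkList.empty()) || DelayList.empty()) {
      N = WorkList.front();
      WorkList.pop_front();
      LastRNWaiting = false;
    } else {
      N = DelayList.front();
      DelayList.pop_front();
    }

    // Delay nodes that are not part of the loop under construction.
    if (CurrentLoop != 0 && N.LoopId != CurrentLoop) {
      LastRNWaiting = true;
      DelayList.push_back(N);
      continue;
    }

    CurrentLoop = N.LoopId;
    std::cout << N.Name << "\n"; // Prints header, body, latch, after in turn.
    if (N.ClosesLoop)
      CurrentLoop = 0;
  }
  return 0;
}
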
Index: polly/trunk/test/ScopInfo/schedule-const-post-dominator-walk-2.ll
===================================================================
--- polly/trunk/test/ScopInfo/schedule-const-post-dominator-walk-2.ll
+++ polly/trunk/test/ScopInfo/schedule-const-post-dominator-walk-2.ll
@@ -0,0 +1,42 @@
+; RUN: opt %loadPolly -analyze -polly-scops < %s | FileCheck %s
+
+; CHECK: Stmt_loopA[i0] -> [0, 0, 0]
+; CHECK-DAG: Stmt_loopB[i0] -> [0, 0, 1]
+; CHECK-DAG: Stmt_bbB[] -> [1, 0, 0]
+; CHECK-DAG: Stmt_bbA[] -> [2, 0, 0]
+; CHECK-DAG: Stmt_bbMerge[] -> [3, 0, 0]
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+
+define void @hoge(i64 %p0, i64 %p1, i64 %p2, i64 %p3, float* %A) {
+entry:
+  br label %loopA
+
+loopA:
+  %tmp4 = phi i64 [ 0, %entry ], [ 0, %loopB ]
+  store float 42.0, float* %A
+  %cmp0 = icmp sle i64 %p0, 100
+  br i1 %cmp0, label %loopB, label %bbB
+
+loopB:
+  store float 42.0, float* %A
+  %cmp1 = icmp sle i64 %p1, 100
+  br i1 %cmp1, label %loopA, label %bbA
+
+bbA:
+  store float 42.0, float* %A
+  %cmpbbA = icmp sle i64 %p2, 50
+  br i1 %cmpbbA, label %bbMerge, label %exit
+
+bbB:
+  store float 42.0, float* %A
+  %cmpbbB = icmp sle i64 %p3, 200
+  br i1 %cmpbbB, label %exit, label %bbMerge
+
+bbMerge:
+  store float 42.0, float* %A
+  br label %exit
+
+exit:
+  ret void
+}
Index: polly/trunk/test/ScopInfo/schedule-const-post-dominator-walk.ll
===================================================================
--- polly/trunk/test/ScopInfo/schedule-const-post-dominator-walk.ll
+++ polly/trunk/test/ScopInfo/schedule-const-post-dominator-walk.ll
@@ -0,0 +1,34 @@
+; RUN: opt %loadPolly -analyze -polly-scops < %s | FileCheck %s
+
+; CHECK: { Stmt_bb3[i0] -> [0, 0] };
+; CHECK: { Stmt_bb2[] -> [1, 0] };
+
+; Verify that we generate the correct schedule. In older versions of Polly,
+; we generated an incorrect schedule:
+;
+;   { Stmt_bb3[i0] -> [1, 0]; Stmt_bb2[] -> [0, 0] }
+;
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+
+define void @hoge() {
+bb:
+  br label %bb3
+
+bb1:                                              ; preds = %bb5
+  br label %bb6
+
+bb2:                                              ; preds = %bb3
+  %tmp = phi i64 [ %tmp4, %bb3 ]
+  br label %bb6
+
+bb3:                                              ; preds = %bb5, %bb
+  %tmp4 = phi i64 [ 0, %bb ], [ 0, %bb5 ]
+  br i1 false, label %bb5, label %bb2
+
+bb5:                                              ; preds = %bb3
+  br i1 false, label %bb3, label %bb1
+
+bb6:                                              ; preds = %bb2, %bb1
+  %tmp2 = phi i64 [ %tmp, %bb2 ], [ undef, %bb1 ]
+  ret void
+}