Index: lib/Transforms/Scalar/StructurizeCFG.cpp
===================================================================
--- lib/Transforms/Scalar/StructurizeCFG.cpp
+++ lib/Transforms/Scalar/StructurizeCFG.cpp
@@ -204,6 +204,9 @@
 
   void orderNodes();
 
+  Loop *getAdjustedLoop(RegionNode *RN);
+  unsigned getAdjustedLoopDepth(RegionNode *RN);
+
   void analyzeLoops(RegionNode *N);
 
   Value *invert(Value *Condition);
@@ -301,6 +304,26 @@
   return false;
 }
 
+/// Use the exit block to determine the loop if RN is a SubRegion.
+Loop *StructurizeCFG::getAdjustedLoop(RegionNode *RN) {
+  if (RN->isSubRegion()) {
+    Region *SubRegion = RN->getNodeAs<Region>();
+    return LI->getLoopFor(SubRegion->getExit());
+  }
+
+  return LI->getLoopFor(RN->getEntry());
+}
+
+/// Use the exit block to determine the loop depth if RN is a SubRegion.
+unsigned StructurizeCFG::getAdjustedLoopDepth(RegionNode *RN) {
+  if (RN->isSubRegion()) {
+    Region *SubR = RN->getNodeAs<Region>();
+    return LI->getLoopDepth(SubR->getExit());
+  }
+
+  return LI->getLoopDepth(RN->getEntry());
+}
+
 /// Build up the general order of nodes
 void StructurizeCFG::orderNodes() {
   ReversePostOrderTraversal<Region*> RPOT(ParentRegion);
@@ -310,16 +333,15 @@
   // to what we want. The only problem with it is that sometimes backedges
   // for outer loops will be visited before backedges for inner loops.
   for (RegionNode *RN : RPOT) {
-    BasicBlock *BB = RN->getEntry();
-    Loop *Loop = LI->getLoopFor(BB);
+    Loop *Loop = getAdjustedLoop(RN);
     ++LoopBlocks[Loop];
   }
 
   unsigned CurrentLoopDepth = 0;
   Loop *CurrentLoop = nullptr;
   for (auto I = RPOT.begin(), E = RPOT.end(); I != E; ++I) {
-    BasicBlock *BB = (*I)->getEntry();
-    unsigned LoopDepth = LI->getLoopDepth(BB);
+    RegionNode *RN = cast<RegionNode>(*I);
+    unsigned LoopDepth = getAdjustedLoopDepth(RN);
 
     if (is_contained(Order, *I))
       continue;
@@ -331,15 +353,14 @@
       auto LoopI = I;
       while (unsigned &BlockCount = LoopBlocks[CurrentLoop]) {
        LoopI++;
-        BasicBlock *LoopBB = (*LoopI)->getEntry();
-        if (LI->getLoopFor(LoopBB) == CurrentLoop) {
+        if (getAdjustedLoop(cast<RegionNode>(*LoopI)) == CurrentLoop) {
          --BlockCount;
          Order.push_back(*LoopI);
        }
      }
    }
 
-    CurrentLoop = LI->getLoopFor(BB);
+    CurrentLoop = getAdjustedLoop(RN);
     if (CurrentLoop)
       LoopBlocks[CurrentLoop]--;

Index: test/CodeGen/AMDGPU/nested-loop-conditions.ll
===================================================================
--- test/CodeGen/AMDGPU/nested-loop-conditions.ll
+++ test/CodeGen/AMDGPU/nested-loop-conditions.ll
@@ -123,50 +123,52 @@
 ; Earlier version of above, before a run of the structurizer.
 ; IR-LABEL: @nested_loop_conditions(
 
-; IR: Flow7:
-; IR-NEXT: call void @llvm.amdgcn.end.cf(i64 %17)
-; IR-NEXT: %0 = call { i1, i64 } @llvm.amdgcn.if(i1 %15)
+; IR: Flow3:
+; IR-NEXT: call void @llvm.amdgcn.end.cf(i64 %21)
+; IR-NEXT: %0 = call { i1, i64 } @llvm.amdgcn.if(i1 %13)
 ; IR-NEXT: %1 = extractvalue { i1, i64 } %0, 0
 ; IR-NEXT: %2 = extractvalue { i1, i64 } %0, 1
-; IR-NEXT: br i1 %1, label %bb4.bb13_crit_edge, label %Flow8
+; IR-NEXT: br i1 %1, label %bb4.bb13_crit_edge, label %Flow4
 
-; IR: Flow1:
-; IR-NEXT: %loop.phi = phi i64 [ %loop.phi9, %Flow6 ], [ %phi.broken, %bb14 ]
-; IR-NEXT: %13 = phi <4 x i32> [ %29, %Flow6 ], [ undef, %bb14 ]
-; IR-NEXT: %14 = phi i32 [ %30, %Flow6 ], [ undef, %bb14 ]
-; IR-NEXT: %15 = phi i1 [ %31, %Flow6 ], [ false, %bb14 ]
-; IR-NEXT: %16 = phi i1 [ false, %Flow6 ], [ %8, %bb14 ]
-; IR-NEXT: %17 = call i64 @llvm.amdgcn.else.break(i64 %11, i64 %loop.phi)
-; IR-NEXT: call void @llvm.amdgcn.end.cf(i64 %11)
-; IR-NEXT: %18 = call i1 @llvm.amdgcn.loop(i64 %17)
-; IR-NEXT: br i1 %18, label %Flow7, label %bb14
+; IR: Flow4:
+; IR-NEXT: %3 = phi i1 [ true, %bb4.bb13_crit_edge ], [ false, %Flow3 ]
+; IR-NEXT: call void @llvm.amdgcn.end.cf(i64 %2)
+; IR-NEXT: br label %Flow
 
-; IR: Flow2:
-; IR-NEXT: %loop.phi10 = phi i64 [ %loop.phi11, %Flow5 ], [ %12, %bb16 ]
-; IR-NEXT: %19 = phi <4 x i32> [ %29, %Flow5 ], [ undef, %bb16 ]
-; IR-NEXT: %20 = phi i32 [ %30, %Flow5 ], [ undef, %bb16 ]
-; IR-NEXT: %21 = phi i1 [ %31, %Flow5 ], [ false, %bb16 ]
-; IR-NEXT: %22 = phi i1 [ false, %Flow5 ], [ false, %bb16 ]
-; IR-NEXT: %23 = phi i1 [ false, %Flow5 ], [ %8, %bb16 ]
-; IR-NEXT: %24 = call { i1, i64 } @llvm.amdgcn.if(i1 %23)
-; IR-NEXT: %25 = extractvalue { i1, i64 } %24, 0
-; IR-NEXT: %26 = extractvalue { i1, i64 } %24, 1
-; IR-NEXT: br i1 %25, label %bb21, label %Flow3
+; IR: Flow:
+; IR-NEXT: %4 = phi i1 [ %3, %Flow4 ], [ true, %bb ]
+; IR-NEXT: %5 = call { i1, i64 } @llvm.amdgcn.if(i1 %4)
+; IR-NEXT: %6 = extractvalue { i1, i64 } %5, 0
+; IR-NEXT: %7 = extractvalue { i1, i64 } %5, 1
+; IR-NEXT: br i1 %6, label %bb13, label %bb31
+
+; IR: bb14:
+; IR: %tmp15 = icmp eq i32 %tmp1037, 1
+; IR-NEXT: %8 = call { i1, i64 } @llvm.amdgcn.if(i1 %tmp15)
+
+; IR: Flow1:
+; IR-NEXT: %loop.phi = phi i64 [ %18, %bb21 ], [ %phi.broken, %bb14 ]
+; IR-NEXT: %11 = phi <4 x i32> [ %tmp9, %bb21 ], [ undef, %bb14 ]
+; IR-NEXT: %12 = phi i32 [ %tmp10, %bb21 ], [ undef, %bb14 ]
+; IR-NEXT: %13 = phi i1 [ %17, %bb21 ], [ false, %bb14 ]
+; IR-NEXT: %14 = phi i1 [ false, %bb21 ], [ true, %bb14 ]
+; IR-NEXT: %15 = call i64 @llvm.amdgcn.else.break(i64 %10, i64 %loop.phi)
+; IR-NEXT: call void @llvm.amdgcn.end.cf(i64 %10)
+; IR-NEXT: %16 = call i1 @llvm.amdgcn.loop(i64 %15)
+; IR-NEXT: br i1 %16, label %Flow2, label %bb14
 
 ; IR: bb21:
 ; IR: %tmp12 = icmp slt i32 %tmp11, 9
-; IR-NEXT: %27 = xor i1 %tmp12, true
-; IR-NEXT: %28 = call i64 @llvm.amdgcn.if.break(i1 %27, i64 %phi.broken)
-; IR-NEXT: br label %Flow3
+; IR-NEXT: %17 = xor i1 %tmp12, true
+; IR-NEXT: %18 = call i64 @llvm.amdgcn.if.break(i1 %17, i64 %phi.broken)
+; IR-NEXT: br label %Flow1
 
-; IR: Flow3:
-; IR-NEXT: %loop.phi11 = phi i64 [ %phi.broken, %bb21 ], [ %phi.broken, %Flow2 ]
-; IR-NEXT: %loop.phi9 = phi i64 [ %28, %bb21 ], [ %loop.phi10, %Flow2 ]
-; IR-NEXT: %29 = phi <4 x i32> [ %tmp9, %bb21 ], [ %19, %Flow2 ]
-; IR-NEXT: %30 = phi i32 [ %tmp10, %bb21 ], [ %20, %Flow2 ]
-; IR-NEXT: %31 = phi i1 [ %27, %bb21 ], [ %21, %Flow2 ]
-; IR-NEXT: call void @llvm.amdgcn.end.cf(i64 %26)
-; IR-NEXT: br i1 %22, label %bb31.loopexit, label %Flow4
+; IR: Flow2:
+; IR-NEXT: call void @llvm.amdgcn.end.cf(i64 %15)
+; IR-NEXT: %19 = call { i1, i64 } @llvm.amdgcn.if(i1 %14)
+; IR-NEXT: %20 = extractvalue { i1, i64 } %19, 0
+; IR-NEXT: %21 = extractvalue { i1, i64 } %19, 1
+; IR-NEXT: br i1 %20, label %bb31.loopexit, label %Flow3
 
 ; IR: bb31:
 ; IR-NEXT: call void @llvm.amdgcn.end.cf(i64 %7)

Index: test/Transforms/StructurizeCFG/AMDGPU/loop-subregion-misordered.ll
===================================================================
--- /dev/null
+++ test/Transforms/StructurizeCFG/AMDGPU/loop-subregion-misordered.ll
@@ -0,0 +1,182 @@
+; RUN: opt -mtriple=amdgcn-amd-amdhsa -S -structurizecfg %s | FileCheck %s
+;
+; StructurizeCFG::orderNodes basically uses a reverse post-order (RPO) traversal of the region
+; list to get the order. The only problem with it is that sometimes backedges
+; for outer loops will be visited before backedges for inner loops. To solve this problem,
+; a loop-depth-based approach is used to make sure all blocks of a loop have been visited
+; before moving on to the outer loop.
+;
+; However, we found a problem for a SubRegion that is itself a loop:
+;              _
+;             | |
+;             V |
+; --> BB1 --> BB2 --> BB3 -->
+;
+; In this case, BB2 is a SubRegion (loop), and thus its loop depth is different from that of
+; BB1 and BB3. This causes BB2 to be placed in the wrong order.
+;
+; To fix this, we treat the SubRegion as a special case and use its exit block to determine
+; the loop and its depth to guard the sorting.
+define amdgpu_kernel void @loop_subregion_misordered(i32 addrspace(1)* %arg0) #0 {
+; CHECK-LABEL: @loop_subregion_misordered(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[TMP:%.*]] = load volatile <2 x i32>, <2 x i32> addrspace(1)* undef, align 16
+; CHECK-NEXT: [[LOAD1:%.*]] = load volatile <2 x float>, <2 x float> addrspace(1)* undef
+; CHECK-NEXT: [[TID:%.*]] = call i32 @llvm.amdgcn.workitem.id.x()
+; CHECK-NEXT: [[GEP:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[ARG0:%.*]], i32 [[TID]]
+; CHECK-NEXT: [[I_INITIAL:%.*]] = load volatile i32, i32 addrspace(1)* [[GEP]], align 4
+; CHECK-NEXT: br label [[LOOP_HEADER:%.*]]
+
+; CHECK: LOOP.HEADER:
+; CHECK-NEXT: [[I:%.*]] = phi i32 [ [[I_INITIAL]], [[ENTRY:%.*]] ], [ [[NUM7:%.*]], [[FLOW3:%.*]] ]
+; CHECK-NEXT: call void asm sideeffect "s_nop 0x100b
+; CHECK-NEXT: [[TMP12:%.*]] = zext i32 [[I]] to i64
+; CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* null, i64 [[TMP12]]
+; CHECK-NEXT: [[TMP14:%.*]] = load <4 x i32>, <4 x i32> addrspace(1)* [[TMP13]], align 16
+; CHECK-NEXT: [[TMP15:%.*]] = extractelement <4 x i32> [[TMP14]], i64 0
+; CHECK-NEXT: [[TMP16:%.*]] = and i32 [[TMP15]], 65535
+; CHECK-NEXT: [[TMP17:%.*]] = icmp eq i32 [[TMP16]], 1
+; CHECK-NEXT: [[NUM0:%.*]] = xor i1 [[TMP17]], true
+; CHECK-NEXT: br i1 [[NUM0]], label [[BB62:%.*]], label [[FLOW:%.*]]
+
+; CHECK: Flow1:
+; CHECK-NEXT: [[NUM1:%.*]] = phi i32 [ [[INC_I:%.*]], [[INCREMENT_I:%.*]] ], [ undef, [[BB62]] ]
+; CHECK-NEXT: [[NUM2:%.*]] = phi i1 [ false, [[INCREMENT_I]] ], [ true, [[BB62]] ]
+; CHECK-NEXT: [[NUM3:%.*]] = phi i1 [ true, [[INCREMENT_I]] ], [ false, [[BB62]] ]
+; CHECK-NEXT: br label [[FLOW]]
+
+; CHECK: bb18:
+; CHECK-NEXT: [[TMP19:%.*]] = extractelement <2 x i32> [[TMP]], i64 0
+; CHECK-NEXT: [[TMP22:%.*]] = lshr i32 [[TMP19]], 16
+; CHECK-NEXT: [[TMP24:%.*]] = urem i32 [[TMP22]], 52
+; CHECK-NEXT: [[TMP25:%.*]] = mul nuw nsw i32 [[TMP24]], 52
+; CHECK-NEXT: br label [[INNER_LOOP:%.*]]
+
+; CHECK: Flow2:
+; CHECK-NEXT: [[NUM4:%.*]] = phi i32 [ [[TMP59:%.*]], [[INNER_LOOP_BREAK:%.*]] ], [ [[NUM9:%.*]], [[FLOW]] ]
+; CHECK-NEXT: [[NUM5:%.*]] = phi i1 [ true, [[INNER_LOOP_BREAK]] ], [ [[NUM11:%.*]], [[FLOW]] ]
+; CHECK-NEXT: br i1 [[NUM5]], label [[END_ELSE_BLOCK:%.*]], label [[FLOW3:%.*]]
+
+; CHECK: INNER_LOOP:
+; CHECK-NEXT: [[INNER_LOOP_J:%.*]] = phi i32 [ [[INNER_LOOP_J_INC:%.*]], [[INNER_LOOP]] ], [ [[TMP25]], [[BB18:%.*]] ]
+; CHECK-NEXT: call void asm sideeffect "
+; CHECK-NEXT: [[INNER_LOOP_J_INC]] = add nsw i32 [[INNER_LOOP_J]], 1
+; CHECK-NEXT: [[INNER_LOOP_CMP:%.*]] = icmp eq i32 [[INNER_LOOP_J]], 0
+; CHECK-NEXT: br i1 [[INNER_LOOP_CMP]], label [[INNER_LOOP_BREAK]], label [[INNER_LOOP]]
+
+; CHECK: INNER_LOOP_BREAK:
+; CHECK-NEXT: [[TMP59]] = extractelement <4 x i32> [[TMP14]], i64 2
+; CHECK-NEXT: call void asm sideeffect "s_nop 23 ", "~{memory}"() #0
+; CHECK-NEXT: br label [[FLOW2:%.*]]
+
+; CHECK: bb62:
+; CHECK-NEXT: [[LOAD13:%.*]] = icmp ult i32 [[TMP16]], 271
+; CHECK-NEXT: [[NUM6:%.*]] = xor i1 [[LOAD13]], true
+; CHECK-NEXT: br i1 [[NUM6]], label [[INCREMENT_I]], label [[FLOW1:%.*]]
+
+; CHECK: Flow3:
+; CHECK-NEXT: [[NUM7]] = phi i32 [ [[I_FINAL:%.*]], [[END_ELSE_BLOCK]] ], [ undef, [[FLOW2]] ]
+; CHECK-NEXT: [[NUM8:%.*]] = phi i1 [ [[CMP_END_ELSE_BLOCK:%.*]], [[END_ELSE_BLOCK]] ], [ true, [[FLOW2]] ]
+; CHECK-NEXT: br i1 [[NUM8]], label [[FLOW4:%.*]], label [[LOOP_HEADER]]
+
+; CHECK: Flow4:
+; CHECK-NEXT: br i1 [[NUM10:%.*]], label %bb64, label [[RETURN:%.*]]
+
+; CHECK: bb64:
+; CHECK-NEXT: call void asm sideeffect "s_nop 42", "~{memory}"() #0
+; CHECK-NEXT: br label [[RETURN]]
+
+; CHECK: Flow:
+; CHECK-NEXT: [[NUM9]] = phi i32 [ [[NUM1]], [[FLOW1]] ], [ undef, [[LOOP_HEADER]] ]
+; CHECK-NEXT: [[NUM10]] = phi i1 [ [[NUM2]], [[FLOW1]] ], [ false, [[LOOP_HEADER]] ]
+; CHECK-NEXT: [[NUM11]] = phi i1 [ [[NUM3]], [[FLOW1]] ], [ false, [[LOOP_HEADER]] ]
+; CHECK-NEXT: [[NUM12:%.*]] = phi i1 [ false, [[FLOW1]] ], [ true, [[LOOP_HEADER]] ]
+; CHECK-NEXT: br i1 [[NUM12]], label [[BB18]], label [[FLOW2]]
+
+; CHECK: INCREMENT_I:
+; CHECK-NEXT: [[INC_I]] = add i32 [[I]], 1
+; CHECK-NEXT: call void asm sideeffect "s_nop 0x1336
+; CHECK-NEXT: br label [[FLOW1]]
+
+; CHECK: END_ELSE_BLOCK:
+; CHECK-NEXT: [[I_FINAL]] = phi i32 [ [[NUM4]], [[FLOW2]] ]
+; CHECK-NEXT: call void asm sideeffect "s_nop 0x1337
+; CHECK-NEXT: [[CMP_END_ELSE_BLOCK]] = icmp eq i32 [[I_FINAL]], -1
+; CHECK-NEXT: br label [[FLOW3]]
+
+; CHECK: RETURN:
+; CHECK-NEXT: call void asm sideeffect "s_nop 0x99
+; CHECK-NEXT: store volatile <2 x float> [[LOAD1]], <2 x float> addrspace(1)* undef, align 8
+; CHECK-NEXT: ret void
+;
+entry:
+  %tmp = load volatile <2 x i32>, <2 x i32> addrspace(1)* undef, align 16
+  %load1 = load volatile <2 x float>, <2 x float> addrspace(1)* undef
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
+  %gep = getelementptr inbounds i32, i32 addrspace(1)* %arg0, i32 %tid
+  %i.initial = load volatile i32, i32 addrspace(1)* %gep, align 4
+  br label %LOOP.HEADER
+
+LOOP.HEADER:
+  %i = phi i32 [ %i.final, %END_ELSE_BLOCK ], [ %i.initial, %entry ]
+  call void asm sideeffect "s_nop 0x100b ; loop $0 ", "r,~{memory}"(i32 %i) #0
+  %tmp12 = zext i32 %i to i64
+  %tmp13 = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* null, i64 %tmp12
+  %tmp14 = load <4 x i32>, <4 x i32> addrspace(1)* %tmp13, align 16
+  %tmp15 = extractelement <4 x i32> %tmp14, i64 0
+  %tmp16 = and i32 %tmp15, 65535
+  %tmp17 = icmp eq i32 %tmp16, 1
+  br i1 %tmp17, label %bb18, label %bb62
+
+bb18:
+  %tmp19 = extractelement <2 x i32> %tmp, i64 0
+  %tmp22 = lshr i32 %tmp19, 16
+  %tmp24 = urem i32 %tmp22, 52
+  %tmp25 = mul nuw nsw i32 %tmp24, 52
+  br label %INNER_LOOP
+
+INNER_LOOP:
+  %inner.loop.j = phi i32 [ %tmp25, %bb18 ], [ %inner.loop.j.inc, %INNER_LOOP ]
+  call void asm sideeffect "; inner loop body", ""() #0
+  %inner.loop.j.inc = add nsw i32 %inner.loop.j, 1
+  %inner.loop.cmp = icmp eq i32 %inner.loop.j, 0
+  br i1 %inner.loop.cmp, label %INNER_LOOP_BREAK, label %INNER_LOOP
+
+INNER_LOOP_BREAK:
+  %tmp59 = extractelement <4 x i32> %tmp14, i64 2
+  call void asm sideeffect "s_nop 23 ", "~{memory}"() #0
+  br label %END_ELSE_BLOCK
+
+bb62:
+  %load13 = icmp ult i32 %tmp16, 271
+  br i1 %load13, label %bb64, label %INCREMENT_I
+
+bb64:
+  call void asm sideeffect "s_nop 42", "~{memory}"() #0
+  br label %RETURN
+
+INCREMENT_I:
+  %inc.i = add i32 %i, 1
+  call void asm sideeffect "s_nop 0x1336 ; increment $0", "v,~{memory}"(i32 %inc.i) #0
+  br label %END_ELSE_BLOCK
+
+END_ELSE_BLOCK:
+  %i.final = phi i32 [ %tmp59, %INNER_LOOP_BREAK ], [ %inc.i, %INCREMENT_I ]
+  call void asm sideeffect "s_nop 0x1337 ; end else block $0", "v,~{memory}"(i32 %i.final) #0
+  %cmp.end.else.block = icmp eq i32 %i.final, -1
+  br i1 %cmp.end.else.block, label %RETURN, label %LOOP.HEADER
+
+RETURN:
+  call void asm sideeffect "s_nop 0x99 ; ClosureEval return", "~{memory}"() #0
+  store volatile <2 x float> %load1, <2 x float> addrspace(1)* undef, align 8
+  ret void
+}
+
+; The same function, except break to return block goes directly to the
+; return, which managed to hide the bug.
+; FIXME: Merge variant from backedge-id-bug-xfail
+
+declare i32 @llvm.amdgcn.workitem.id.x() #1
+
+attributes #0 = { convergent nounwind }
+attributes #1 = { convergent nounwind readnone }