Index: lib/Target/AMDGPU/SIAnnotateControlFlow.cpp
===================================================================
--- lib/Target/AMDGPU/SIAnnotateControlFlow.cpp
+++ lib/Target/AMDGPU/SIAnnotateControlFlow.cpp
@@ -264,8 +264,17 @@
   Term->setCondition(BoolTrue);
   Value *Arg = handleLoopCondition(Cond, Broken, L, Term);
 
-  for (BasicBlock *Pred : predecessors(Target))
-    Broken->addIncoming(Pred == BB ? Arg : Int64Zero, Pred);
+  for (BasicBlock *Pred : predecessors(Target)) {
+    Value *PHIValue = Int64Zero;
+    if (Pred == BB) // Remember the value of the previous iteration.
+      PHIValue = Arg;
+    // If the backedge from Pred to Target could be executed before the exit
+    // of the loop at BB, it should not reset or change "Broken", which keeps
+    // track of the number of threads exited the loop at BB.
+    else if (L->contains(Pred) && DT->dominates(Pred, BB))
+      PHIValue = Broken;
+    Broken->addIncoming(PHIValue, Pred);
+  }
 
   Term->setCondition(CallInst::Create(Loop, Arg, "", Term));
Index: test/CodeGen/AMDGPU/si-annotatecfg-multiple-backedges.ll
===================================================================
--- /dev/null
+++ test/CodeGen/AMDGPU/si-annotatecfg-multiple-backedges.ll
@@ -0,0 +1,49 @@
+; RUN: opt -mtriple=amdgcn-- -S -structurizecfg -si-annotate-control-flow %s | FileCheck -check-prefix=OPT %s
+
+
+; OPT-LABEL: @multiple_backedges(
+; OPT: loop:                                             ; preds = %loop_end, %loop, %entry
+; OPT-NEXT: %phi.broken1 = phi i64 [ %2, %loop_end ], [ %phi.broken1, %loop ], [ 0, %entry ]
+; OPT-NEXT: %phi.broken = phi i64 [ 0, %loop_end ], [ %0, %loop ], [ 0, %entry ]
+
+
+; OPT: %tmp6 = icmp slt i32 %arg, %tmp5
+; OPT-NEXT: %0 = call i64 @llvm.amdgcn.if.break(i1 %tmp6, i64 %phi.broken)
+; OPT-NEXT: %1 = call i1 @llvm.amdgcn.loop(i64 %0)
+; OPT-NEXT: br i1 %1, label %loop_end, label %loop
+
+; OPT: loop_end:                                         ; preds = %loop
+; OPT-NEXT: call void @llvm.amdgcn.end.cf(i64 %0)
+; OPT-NEXT: %exit = icmp sgt i32 %tmp5, %tmp2
+; OPT-NEXT: %2 = call i64 @llvm.amdgcn.if.break(i1 %exit, i64 %phi.broken1)
+; OPT-NEXT: %3 = call i1 @llvm.amdgcn.loop(i64 %2)
+; OPT-NEXT: br i1 %3, label %loop_exit, label %loop
+
+; OPT: loop_exit:                                        ; preds = %loop_end
+; OPT-NEXT: call void @llvm.amdgcn.end.cf(i64 %2)
+define amdgpu_kernel void @multiple_backedges(i32 %arg, i32* %arg1) {
+entry:
+  %tmp = tail call i32 @llvm.amdgcn.workitem.id.x()
+  %tmp2 = shl nsw i32 %arg, 1
+  br label %loop
+
+loop:
+  %tmp4 = phi i32 [ 0, %entry ], [ %tmp5, %loop ], [ 0, %loop_end ]
+  %tmp5 = add nsw i32 %tmp4, %tmp
+  %tmp6 = icmp slt i32 %arg, %tmp5
+  br i1 %tmp6, label %loop_end, label %loop
+
+loop_end:
+  %exit = icmp sgt i32 %tmp5, %tmp2
+  br i1 %exit, label %loop_exit, label %loop
+
+loop_exit:
+  %tmp12 = zext i32 %tmp to i64
+  %tmp13 = getelementptr inbounds i32, i32* %arg1, i64 %tmp12
+  %tmp14 = addrspacecast i32* %tmp13 to i32 addrspace(1)*
+  store i32 %tmp5, i32 addrspace(1)* %tmp14, align 4
+  ret void
+}
+
+; Function Attrs: nounwind readnone speculatable
+declare i32 @llvm.amdgcn.workitem.id.x()