Index: llvm/trunk/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
===================================================================
--- llvm/trunk/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
+++ llvm/trunk/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
@@ -345,7 +345,7 @@
 
   void incIterCnt() { IterCnt++; }
   void resetIterCnt() { IterCnt = 0; }
-  int32_t getIterCnt() { return IterCnt; }
+  unsigned getIterCnt() { return IterCnt; }
 
   void setWaitcnt(MachineInstr *WaitcntIn) { LfWaitcnt = WaitcntIn; }
   MachineInstr *getWaitcnt() const { return LfWaitcnt; }
@@ -1205,7 +1205,7 @@
           }
           ScoreBracket->setRevisitLoop(true);
           LLVM_DEBUG(dbgs()
-                         << "set-revisit: Block"
+                         << "set-revisit2: Block"
                          << ContainingLoop->getHeader()->getNumber() << '\n';);
         }
       }
@@ -1639,10 +1639,11 @@
   }
 }
 
-/// Return true if the given basic block is a "bottom" block of a loop. This
-/// differs from MachineLoop::getBottomBlock in that it works even if the loop
-/// is discontiguous. This also handles multiple back-edges for the same
-/// "header" block of a loop.
+/// Return true if the given basic block is a "bottom" block of a loop.
+/// This works even if the loop is discontiguous. This also handles
+/// multiple back-edges for the same "header" block of a loop.
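+/// A "bottom" block here is a block inside the loop that has the loop
+/// header as one of its successors, i.e. the source of a back-edge.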
 bool SIInsertWaitcnts::isLoopBottom(const MachineLoop *Loop,
                                     const MachineBasicBlock *Block) {
   for (MachineBasicBlock *MBB : Loop->blocks()) {
@@ -1776,11 +1775,14 @@
     LLVM_DEBUG(dbgs() << '\n';);
 
     // The iterative waitcnt insertion algorithm aims for optimal waitcnt
-    // placement and doesn't always guarantee convergence for a loop. Each
-    // loop should take at most 2 iterations for it to converge naturally.
-    // When this max is reached and result doesn't converge, we force
-    // convergence by inserting a s_waitcnt at the end of loop footer.
-    if (WaitcntData->getIterCnt() > 2) {
+    // placement, but doesn't guarantee convergence for a loop. Each
+    // loop should take at most (n+1) iterations to converge naturally,
+    // where n is the number of bottom blocks. If this threshold is reached and
+    // the result hasn't converged, we force convergence by inserting
+    // an s_waitcnt at the end of the loop footer.
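+    // For example, a loop whose header has two back-edges (n = 2 bottom
+    // blocks) gets at most 3 iterations before convergence is forced below.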
+    if (WaitcntData->getIterCnt() > (countNumBottomBlocks(ContainingLoop) + 1)) {
       // To ensure convergence, need to make wait events at loop footer be no
       // more than those from the previous iteration.
       // As a simplification, instead of tracking individual scores and
@@ -1792,16 +1792,16 @@
         if (ScoreBrackets->getScoreUB(T) > ScoreBrackets->getScoreLB(T)) {
           ScoreBrackets->setScoreLB(T, ScoreBrackets->getScoreUB(T));
           HasPending = true;
+          break;
         }
       }
 
       if (HasPending) {
         if (!SWaitInst) {
-          SWaitInst = Block.getParent()->CreateMachineInstr(
-              TII->get(AMDGPU::S_WAITCNT), DebugLoc());
+          SWaitInst = BuildMI(Block, Block.getFirstNonPHI(),
+                              DebugLoc(), TII->get(AMDGPU::S_WAITCNT))
+                              .addImm(0);
           TrackedWaitcntSet.insert(SWaitInst);
-          const MachineOperand &Op = MachineOperand::CreateImm(0);
-          SWaitInst->addOperand(MF, Op);
 #if 0 // TODO: Format the debug output
           OutputTransformBanner("insertWaitcntInBlock",0,"Create:",context);
           OutputTransformAdd(SWaitInst, context);
@@ -1898,7 +1898,7 @@
       if ((std::count(BlockWaitcntProcessedSet.begin(),
                       BlockWaitcntProcessedSet.end(), &MBB) < Count)) {
         BlockWaitcntBracketsMap[&MBB]->setRevisitLoop(true);
-        LLVM_DEBUG(dbgs() << "set-revisit: Block"
+        LLVM_DEBUG(dbgs() << "set-revisit1: Block"
                           << ContainingLoop->getHeader()->getNumber() << '\n';);
       }
     }
@@ -1906,7 +1906,7 @@
     // Walk over the instructions.
     insertWaitcntInBlock(MF, MBB);
 
-    // Flag that waitcnts have been processed at least once.
+    // Record that waitcnts have been processed at least once for this block.
     BlockWaitcntProcessedSet.push_back(&MBB);
 
     // See if we want to revisit the loop. If a loop has multiple back-edges,
@@ -2004,8 +2004,12 @@
     // TODO: Could insert earlier and schedule more liberally with operations
     // that only use caller preserved registers.
     MachineBasicBlock &EntryBB = MF.front();
-    BuildMI(EntryBB, EntryBB.getFirstNonPHI(), DebugLoc(), TII->get(AMDGPU::S_WAITCNT))
-      .addImm(0);
+    auto SWaitInst = BuildMI(EntryBB, EntryBB.getFirstNonPHI(),
+                             DebugLoc(), TII->get(AMDGPU::S_WAITCNT))
+                             .addImm(0);
+
+    LLVM_DEBUG(dbgs() << "insertWaitcntInBlock\n"
+                      << "New Instr: " << *SWaitInst << '\n');
 
     Modified = true;
   }
Index: llvm/trunk/test/CodeGen/AMDGPU/waitcnt-back-edge-loop.mir
===================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/waitcnt-back-edge-loop.mir
+++ llvm/trunk/test/CodeGen/AMDGPU/waitcnt-back-edge-loop.mir
@@ -57,3 +57,39 @@
     EXP_DONE 12, killed $vgpr4, undef $vgpr0, undef $vgpr0, undef $vgpr0, 0, 0, 15, implicit $exec
     S_ENDPGM
 ...
+---
+
+# GCN-LABEL: name: waitcnt-multiple-back-edges{{$}}
+# GCN: bb.0:
+# GCN: S_WAITCNT 0
+# GCN-NEXT: S_BRANCH %bb.2
+
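+# bb.2 is a loop header with multiple back-edges (from bb.1, bb.4 and bb.5);
+# bb.6 also branches back to the entry block bb.0.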
+name: waitcnt-multiple-back-edges
+body: |
+  bb.0:
+    S_BRANCH %bb.2
+
+  bb.1:
+    S_BRANCH %bb.2
+
+  bb.2:
+    S_CBRANCH_VCCZ %bb.1, implicit $vcc
+
+  bb.3:
+    S_CBRANCH_VCCNZ %bb.5, implicit $vcc
+
+  bb.4:
+    BUFFER_ATOMIC_ADD_OFFSET renamable $vgpr0, renamable $sgpr12_sgpr13_sgpr14_sgpr15, 0, 4, 0, implicit $exec
+    S_CBRANCH_SCC0 %bb.2, implicit $scc
+    S_BRANCH %bb.6
+
+  bb.5:
+    S_CBRANCH_SCC0 %bb.2, implicit $scc
+    S_BRANCH %bb.6
+
+  bb.6:
+    S_CBRANCH_SCC1 %bb.0, implicit $scc
+    S_ENDPGM
+...