Index: llvm/trunk/lib/Target/AMDGPU/SIInsertWaitcnts.cpp =================================================================== --- llvm/trunk/lib/Target/AMDGPU/SIInsertWaitcnts.cpp +++ llvm/trunk/lib/Target/AMDGPU/SIInsertWaitcnts.cpp @@ -41,7 +41,6 @@ #include "llvm/CodeGen/MachineFunctionPass.h" #include "llvm/CodeGen/MachineInstr.h" #include "llvm/CodeGen/MachineInstrBuilder.h" -#include "llvm/CodeGen/MachineLoopInfo.h" #include "llvm/CodeGen/MachineMemOperand.h" #include "llvm/CodeGen/MachineOperand.h" #include "llvm/CodeGen/MachineRegisterInfo.h" @@ -178,22 +177,21 @@ } } -// This is a per-basic-block object that maintains current score brackets -// of each wait counter, and a per-register scoreboard for each wait counter. +// This objects maintains the current score brackets of each wait counter, and +// a per-register scoreboard for each wait counter. +// // We also maintain the latest score for every event type that can change the // waitcnt in order to know if there are multiple types of events within // the brackets. When multiple types of event happen in the bracket, // wait count may get decreased out of order, therefore we need to put in // "s_waitcnt 0" before use. -class BlockWaitcntBrackets { +class WaitcntBrackets { public: - BlockWaitcntBrackets(const GCNSubtarget *SubTarget) : ST(SubTarget) { + WaitcntBrackets(const GCNSubtarget *SubTarget) : ST(SubTarget) { for (auto T : inst_counter_types()) memset(VgprScores[T], 0, sizeof(VgprScores[T])); } - ~BlockWaitcntBrackets() = default; - static uint32_t getWaitCountMax(InstCounterType T) { switch (T) { case VM_CNT: @@ -208,25 +206,6 @@ return 0; } - void setScoreLB(InstCounterType T, uint32_t Val) { - assert(T < NUM_INST_CNTS); - if (T >= NUM_INST_CNTS) - return; - ScoreLBs[T] = Val; - } - - void setScoreUB(InstCounterType T, uint32_t Val) { - assert(T < NUM_INST_CNTS); - if (T >= NUM_INST_CNTS) - return; - ScoreUBs[T] = Val; - if (T == EXP_CNT) { - uint32_t UB = ScoreUBs[T] - getWaitCountMax(EXP_CNT); - if (ScoreLBs[T] < UB && UB < ScoreUBs[T]) - ScoreLBs[T] = UB; - } - } - uint32_t getScoreLB(InstCounterType T) const { assert(T < NUM_INST_CNTS); if (T >= NUM_INST_CNTS) @@ -251,21 +230,6 @@ return EXP_CNT; } - void setRegScore(int GprNo, InstCounterType T, uint32_t Val) { - if (GprNo < NUM_ALL_VGPRS) { - if (GprNo > VgprUB) { - VgprUB = GprNo; - } - VgprScores[T][GprNo] = Val; - } else { - assert(T == LGKM_CNT); - if (GprNo - NUM_ALL_VGPRS > SgprUB) { - SgprUB = GprNo - NUM_ALL_VGPRS; - } - SgprScores[GprNo - NUM_ALL_VGPRS] = Val; - } - } - uint32_t getRegScore(int GprNo, InstCounterType T) { if (GprNo < NUM_ALL_VGPRS) { return VgprScores[T][GprNo]; @@ -284,15 +248,13 @@ memset(SgprScores, 0, sizeof(SgprScores)); } + bool merge(const WaitcntBrackets &Other); + RegInterval getRegInterval(const MachineInstr *MI, const SIInstrInfo *TII, const MachineRegisterInfo *MRI, const SIRegisterInfo *TRI, unsigned OpNo, bool Def) const; - void setExpScore(const MachineInstr *MI, const SIInstrInfo *TII, - const SIRegisterInfo *TRI, const MachineRegisterInfo *MRI, - unsigned OpNo, uint32_t Val); - int32_t getMaxVGPR() const { return VgprUB; } int32_t getMaxSGPR() const { return SgprUB; } @@ -307,12 +269,11 @@ const MachineRegisterInfo *MRI, WaitEventType E, MachineInstr &MI); + bool hasPending() const { return PendingEvents != 0; } bool hasPendingEvent(WaitEventType E) const { return PendingEvents & (1 << E); } - void mergePendingEvents(const BlockWaitcntBrackets &Other); - bool hasPendingFlat() const { return ((LastFlat[LGKM_CNT] > 
ScoreLBs[LGKM_CNT] && LastFlat[LGKM_CNT] <= ScoreUBs[LGKM_CNT]) || @@ -325,23 +286,58 @@ LastFlat[LGKM_CNT] = ScoreUBs[LGKM_CNT]; } - int pendingFlat(InstCounterType Ct) const { return LastFlat[Ct]; } + void print(raw_ostream &); + void dump() { print(dbgs()); } - void setLastFlat(InstCounterType Ct, int Val) { LastFlat[Ct] = Val; } +private: + struct MergeInfo { + uint32_t OldLB; + uint32_t OtherLB; + uint32_t MyShift; + uint32_t OtherShift; + }; + static bool mergeScore(const MergeInfo &M, uint32_t &Score, + uint32_t OtherScore); - bool getRevisitLoop() const { return RevisitLoop; } - void setRevisitLoop(bool RevisitLoopIn) { RevisitLoop = RevisitLoopIn; } + void setScoreLB(InstCounterType T, uint32_t Val) { + assert(T < NUM_INST_CNTS); + if (T >= NUM_INST_CNTS) + return; + ScoreLBs[T] = Val; + } - void setPostOrder(int32_t PostOrderIn) { PostOrder = PostOrderIn; } - int32_t getPostOrder() const { return PostOrder; } + void setScoreUB(InstCounterType T, uint32_t Val) { + assert(T < NUM_INST_CNTS); + if (T >= NUM_INST_CNTS) + return; + ScoreUBs[T] = Val; + if (T == EXP_CNT) { + uint32_t UB = ScoreUBs[T] - getWaitCountMax(EXP_CNT); + if (ScoreLBs[T] < UB && UB < ScoreUBs[T]) + ScoreLBs[T] = UB; + } + } - void print(raw_ostream &); - void dump() { print(dbgs()); } + void setRegScore(int GprNo, InstCounterType T, uint32_t Val) { + if (GprNo < NUM_ALL_VGPRS) { + if (GprNo > VgprUB) { + VgprUB = GprNo; + } + VgprScores[T][GprNo] = Val; + } else { + assert(T == LGKM_CNT); + if (GprNo - NUM_ALL_VGPRS > SgprUB) { + SgprUB = GprNo - NUM_ALL_VGPRS; + } + SgprScores[GprNo - NUM_ALL_VGPRS] = Val; + } + } + + void setExpScore(const MachineInstr *MI, const SIInstrInfo *TII, + const SIRegisterInfo *TRI, const MachineRegisterInfo *MRI, + unsigned OpNo, uint32_t Val); -private: const GCNSubtarget *ST = nullptr; - bool RevisitLoop = false; - int32_t PostOrder = 0; uint32_t ScoreLBs[NUM_INST_CNTS] = {0}; uint32_t ScoreUBs[NUM_INST_CNTS] = {0}; uint32_t PendingEvents = 0; @@ -357,53 +353,27 @@ uint32_t SgprScores[SQ_MAX_PGM_SGPRS] = {0}; }; -// This is a per-loop-region object that records waitcnt status at the end of -// loop footer from the previous iteration. We also maintain an iteration -// count to track the number of times the loop has been visited. When it -// doesn't converge naturally, we force convergence by inserting s_waitcnt 0 -// at the end of the loop footer. -class LoopWaitcntData { -public: - LoopWaitcntData() = default; - ~LoopWaitcntData() = default; - - void incIterCnt() { IterCnt++; } - void resetIterCnt() { IterCnt = 0; } - unsigned getIterCnt() { return IterCnt; } - - void setWaitcnt(MachineInstr *WaitcntIn) { LfWaitcnt = WaitcntIn; } - MachineInstr *getWaitcnt() const { return LfWaitcnt; } - - void print() { LLVM_DEBUG(dbgs() << " iteration " << IterCnt << '\n';); } - -private: - // s_waitcnt added at the end of loop footer to stablize wait scores - // at the end of the loop footer. - MachineInstr *LfWaitcnt = nullptr; - // Number of iterations the loop has been visited, not including the initial - // walk over. 
-  int32_t IterCnt = 0;
-};
-
 class SIInsertWaitcnts : public MachineFunctionPass {
 private:
   const GCNSubtarget *ST = nullptr;
   const SIInstrInfo *TII = nullptr;
   const SIRegisterInfo *TRI = nullptr;
   const MachineRegisterInfo *MRI = nullptr;
-  const MachineLoopInfo *MLI = nullptr;
   AMDGPU::IsaVersion IV;

-  DenseSet<MachineBasicBlock *> BlockVisitedSet;
   DenseSet<MachineInstr *> TrackedWaitcntSet;
   DenseSet<MachineInstr *> VCCZBugHandledSet;
-  DenseMap<MachineBasicBlock *, std::unique_ptr<BlockWaitcntBrackets>>
-      BlockWaitcntBracketsMap;
+  struct BlockInfo {
+    MachineBasicBlock *MBB;
+    std::unique_ptr<WaitcntBrackets> Incoming;
+    bool Dirty = true;

-  std::vector<MachineBasicBlock *> BlockWaitcntProcessedSet;
+    explicit BlockInfo(MachineBasicBlock *MBB) : MBB(MBB) {}
+  };

-  DenseMap<MachineLoop *, std::unique_ptr<LoopWaitcntData>> LoopWaitcntDataMap;
+  std::vector<BlockInfo> BlockInfos; // by reverse post-order traversal index
+  DenseMap<MachineBasicBlock *, unsigned> RpotIdxMap;

   // ForceEmitZeroWaitcnts: force all waitcnts insts to be s_waitcnt 0
   // because of amdgpu-waitcnt-forcezero flag
@@ -427,7 +397,6 @@
   void getAnalysisUsage(AnalysisUsage &AU) const override {
     AU.setPreservesCFG();
-    AU.addRequired<MachineLoopInfo>();
     MachineFunctionPass::getAnalysisUsage(AU);
   }
@@ -466,26 +435,22 @@
   }

   bool mayAccessLDSThroughFlat(const MachineInstr &MI) const;
-  void generateWaitcntInstBefore(MachineInstr &MI,
-                                 BlockWaitcntBrackets *ScoreBrackets,
+  bool generateWaitcntInstBefore(MachineInstr &MI,
+                                 WaitcntBrackets &ScoreBrackets,
                                  MachineInstr *OldWaitcntInstr);
   void updateEventWaitcntAfter(MachineInstr &Inst,
-                               BlockWaitcntBrackets *ScoreBrackets);
-  void mergeInputScoreBrackets(MachineBasicBlock &Block);
-  bool isLoopBottom(const MachineLoop *Loop, const MachineBasicBlock *Block);
-  unsigned countNumBottomBlocks(const MachineLoop *Loop);
-  void insertWaitcntInBlock(MachineFunction &MF, MachineBasicBlock &Block);
-  void insertWaitcntBeforeCF(MachineBasicBlock &Block, MachineInstr *Inst);
+                               WaitcntBrackets *ScoreBrackets);
+  bool insertWaitcntInBlock(MachineFunction &MF, MachineBasicBlock &Block,
+                            WaitcntBrackets &ScoreBrackets);
 };

 } // end anonymous namespace

-RegInterval BlockWaitcntBrackets::getRegInterval(const MachineInstr *MI,
-                                                 const SIInstrInfo *TII,
-                                                 const MachineRegisterInfo *MRI,
-                                                 const SIRegisterInfo *TRI,
-                                                 unsigned OpNo,
-                                                 bool Def) const {
+RegInterval WaitcntBrackets::getRegInterval(const MachineInstr *MI,
+                                            const SIInstrInfo *TII,
+                                            const MachineRegisterInfo *MRI,
+                                            const SIRegisterInfo *TRI,
+                                            unsigned OpNo, bool Def) const {
   const MachineOperand &Op = MI->getOperand(OpNo);
   if (!Op.isReg() || !TRI->isInAllocatableClass(Op.getReg()) ||
       (Def && !Op.isDef()))
@@ -523,11 +488,11 @@
   return Result;
 }

-void BlockWaitcntBrackets::setExpScore(const MachineInstr *MI,
-                                       const SIInstrInfo *TII,
-                                       const SIRegisterInfo *TRI,
-                                       const MachineRegisterInfo *MRI,
-                                       unsigned OpNo, uint32_t Val) {
+void WaitcntBrackets::setExpScore(const MachineInstr *MI,
+                                  const SIInstrInfo *TII,
+                                  const SIRegisterInfo *TRI,
+                                  const MachineRegisterInfo *MRI, unsigned OpNo,
+                                  uint32_t Val) {
   RegInterval Interval = getRegInterval(MI, TII, MRI, TRI, OpNo, false);
   LLVM_DEBUG({
     const MachineOperand &Opnd = MI->getOperand(OpNo);
@@ -538,10 +503,10 @@
   }
 }

-void BlockWaitcntBrackets::updateByEvent(const SIInstrInfo *TII,
-                                         const SIRegisterInfo *TRI,
-                                         const MachineRegisterInfo *MRI,
-                                         WaitEventType E, MachineInstr &Inst) {
+void WaitcntBrackets::updateByEvent(const SIInstrInfo *TII,
+                                    const SIRegisterInfo *TRI,
+                                    const MachineRegisterInfo *MRI,
+                                    WaitEventType E, MachineInstr &Inst) {
   const MachineRegisterInfo &MRIA = *MRI;
   InstCounterType T = eventCounter(E);
   uint32_t CurrScore = getScoreUB(T) + 1;
@@ -682,7 +647,7 @@
   }
 }

-void BlockWaitcntBrackets::print(raw_ostream &OS) {
+void WaitcntBrackets::print(raw_ostream &OS) { OS << '\n'; for (auto T : inst_counter_types()) { uint32_t LB = getScoreLB(T); @@ -734,14 +699,14 @@ /// Simplify the waitcnt, in the sense of removing redundant counts, and return /// whether a waitcnt instruction is needed at all. -bool BlockWaitcntBrackets::simplifyWaitcnt(AMDGPU::Waitcnt &Wait) const { +bool WaitcntBrackets::simplifyWaitcnt(AMDGPU::Waitcnt &Wait) const { return simplifyWaitcnt(VM_CNT, Wait.VmCnt) | simplifyWaitcnt(EXP_CNT, Wait.ExpCnt) | simplifyWaitcnt(LGKM_CNT, Wait.LgkmCnt); } -bool BlockWaitcntBrackets::simplifyWaitcnt(InstCounterType T, - unsigned &Count) const { +bool WaitcntBrackets::simplifyWaitcnt(InstCounterType T, + unsigned &Count) const { const uint32_t LB = getScoreLB(T); const uint32_t UB = getScoreUB(T); if (Count < UB && UB - Count > LB) @@ -751,9 +716,8 @@ return false; } -void BlockWaitcntBrackets::determineWait(InstCounterType T, - uint32_t ScoreToWait, - AMDGPU::Waitcnt &Wait) const { +void WaitcntBrackets::determineWait(InstCounterType T, uint32_t ScoreToWait, + AMDGPU::Waitcnt &Wait) const { // If the score of src_operand falls within the bracket, we need an // s_waitcnt instruction. const uint32_t LB = getScoreLB(T); @@ -777,13 +741,13 @@ } } -void BlockWaitcntBrackets::applyWaitcnt(const AMDGPU::Waitcnt &Wait) { +void WaitcntBrackets::applyWaitcnt(const AMDGPU::Waitcnt &Wait) { applyWaitcnt(VM_CNT, Wait.VmCnt); applyWaitcnt(EXP_CNT, Wait.ExpCnt); applyWaitcnt(LGKM_CNT, Wait.LgkmCnt); } -void BlockWaitcntBrackets::applyWaitcnt(InstCounterType T, unsigned Count) { +void WaitcntBrackets::applyWaitcnt(InstCounterType T, unsigned Count) { const uint32_t UB = getScoreUB(T); if (Count >= UB) return; @@ -798,19 +762,9 @@ } } -void BlockWaitcntBrackets::mergePendingEvents(const BlockWaitcntBrackets &Other) { - for (auto T : inst_counter_types()) { - uint32_t Old = PendingEvents & WaitEventMaskForInst[T]; - uint32_t New = Other.PendingEvents & WaitEventMaskForInst[T]; - if (Other.MixedPendingEvents[T] || (Old && New && Old != New)) - MixedPendingEvents[T] = true; - PendingEvents |= New; - } -} - // Where there are multiple types of event in the bracket of a counter, // the decrement may go out of order. -bool BlockWaitcntBrackets::counterOutOfOrder(InstCounterType T) const { +bool WaitcntBrackets::counterOutOfOrder(InstCounterType T) const { // Scalar memory read always can go out of order. if (T == LGKM_CNT && hasPendingEvent(SMEM_ACCESS)) return true; @@ -846,22 +800,22 @@ /// and if so what the value of each counter is. /// The "score bracket" is bound by the lower bound and upper bound /// scores (*_score_LB and *_score_ub respectively). -void SIInsertWaitcnts::generateWaitcntInstBefore( - MachineInstr &MI, BlockWaitcntBrackets *ScoreBrackets, +bool SIInsertWaitcnts::generateWaitcntInstBefore( + MachineInstr &MI, WaitcntBrackets &ScoreBrackets, MachineInstr *OldWaitcntInstr) { setForceEmitWaitcnt(); bool IsForceEmitWaitcnt = isForceEmitWaitcnt(); if (MI.isDebugInstr()) - return; + return false; AMDGPU::Waitcnt Wait; // See if this instruction has a forced S_WAITCNT VM. 
// TODO: Handle other cases of NeedsWaitcntVmBefore() if (MI.getOpcode() == AMDGPU::BUFFER_WBINVL1 || - MI.getOpcode() == AMDGPU::BUFFER_WBINVL1_SC || - MI.getOpcode() == AMDGPU::BUFFER_WBINVL1_VOL) { + MI.getOpcode() == AMDGPU::BUFFER_WBINVL1_SC || + MI.getOpcode() == AMDGPU::BUFFER_WBINVL1_VOL) { Wait.VmCnt = 0; } @@ -941,10 +895,10 @@ if (MI.modifiesRegister(AMDGPU::EXEC, TRI)) { // Export and GDS are tracked individually, either may trigger a waitcnt // for EXEC. - if (ScoreBrackets->hasPendingEvent(EXP_GPR_LOCK) || - ScoreBrackets->hasPendingEvent(EXP_PARAM_ACCESS) || - ScoreBrackets->hasPendingEvent(EXP_POS_ACCESS) || - ScoreBrackets->hasPendingEvent(GDS_GPR_LOCK)) { + if (ScoreBrackets.hasPendingEvent(EXP_GPR_LOCK) || + ScoreBrackets.hasPendingEvent(EXP_PARAM_ACCESS) || + ScoreBrackets.hasPendingEvent(EXP_POS_ACCESS) || + ScoreBrackets.hasPendingEvent(GDS_GPR_LOCK)) { Wait.ExpCnt = 0; } } @@ -978,23 +932,23 @@ continue; unsigned RegNo = SQ_MAX_PGM_VGPRS + EXTRA_VGPR_LDS; // VM_CNT is only relevant to vgpr or LDS. - ScoreBrackets->determineWait( - VM_CNT, ScoreBrackets->getRegScore(RegNo, VM_CNT), Wait); + ScoreBrackets.determineWait( + VM_CNT, ScoreBrackets.getRegScore(RegNo, VM_CNT), Wait); } for (unsigned I = 0, E = MI.getNumOperands(); I != E; ++I) { const MachineOperand &Op = MI.getOperand(I); const MachineRegisterInfo &MRIA = *MRI; RegInterval Interval = - ScoreBrackets->getRegInterval(&MI, TII, MRI, TRI, I, false); + ScoreBrackets.getRegInterval(&MI, TII, MRI, TRI, I, false); for (signed RegNo = Interval.first; RegNo < Interval.second; ++RegNo) { if (TRI->isVGPR(MRIA, Op.getReg())) { // VM_CNT is only relevant to vgpr or LDS. - ScoreBrackets->determineWait( - VM_CNT, ScoreBrackets->getRegScore(RegNo, VM_CNT), Wait); + ScoreBrackets.determineWait( + VM_CNT, ScoreBrackets.getRegScore(RegNo, VM_CNT), Wait); } - ScoreBrackets->determineWait( - LGKM_CNT, ScoreBrackets->getRegScore(RegNo, LGKM_CNT), Wait); + ScoreBrackets.determineWait( + LGKM_CNT, ScoreBrackets.getRegScore(RegNo, LGKM_CNT), Wait); } } // End of for loop that looks at all source operands to decide vm_wait_cnt @@ -1012,26 +966,26 @@ if (AS != AMDGPUAS::LOCAL_ADDRESS) continue; unsigned RegNo = SQ_MAX_PGM_VGPRS + EXTRA_VGPR_LDS; - ScoreBrackets->determineWait( - VM_CNT, ScoreBrackets->getRegScore(RegNo, VM_CNT), Wait); - ScoreBrackets->determineWait( - EXP_CNT, ScoreBrackets->getRegScore(RegNo, EXP_CNT), Wait); + ScoreBrackets.determineWait( + VM_CNT, ScoreBrackets.getRegScore(RegNo, VM_CNT), Wait); + ScoreBrackets.determineWait( + EXP_CNT, ScoreBrackets.getRegScore(RegNo, EXP_CNT), Wait); } } for (unsigned I = 0, E = MI.getNumOperands(); I != E; ++I) { MachineOperand &Def = MI.getOperand(I); const MachineRegisterInfo &MRIA = *MRI; RegInterval Interval = - ScoreBrackets->getRegInterval(&MI, TII, MRI, TRI, I, true); + ScoreBrackets.getRegInterval(&MI, TII, MRI, TRI, I, true); for (signed RegNo = Interval.first; RegNo < Interval.second; ++RegNo) { if (TRI->isVGPR(MRIA, Def.getReg())) { - ScoreBrackets->determineWait( - VM_CNT, ScoreBrackets->getRegScore(RegNo, VM_CNT), Wait); - ScoreBrackets->determineWait( - EXP_CNT, ScoreBrackets->getRegScore(RegNo, EXP_CNT), Wait); + ScoreBrackets.determineWait( + VM_CNT, ScoreBrackets.getRegScore(RegNo, VM_CNT), Wait); + ScoreBrackets.determineWait( + EXP_CNT, ScoreBrackets.getRegScore(RegNo, EXP_CNT), Wait); } - ScoreBrackets->determineWait( - LGKM_CNT, ScoreBrackets->getRegScore(RegNo, LGKM_CNT), Wait); + ScoreBrackets.determineWait( + LGKM_CNT, ScoreBrackets.getRegScore(RegNo, 
LGKM_CNT), Wait); } } // End of for loop that looks at all dest operands. } @@ -1049,25 +1003,28 @@ // after fixing the scheduler. Also, the Shader Compiler code is // independent of target. if (readsVCCZ(MI) && ST->getGeneration() <= AMDGPUSubtarget::SEA_ISLANDS) { - if (ScoreBrackets->getScoreLB(LGKM_CNT) < - ScoreBrackets->getScoreUB(LGKM_CNT) && - ScoreBrackets->hasPendingEvent(SMEM_ACCESS)) { + if (ScoreBrackets.getScoreLB(LGKM_CNT) < + ScoreBrackets.getScoreUB(LGKM_CNT) && + ScoreBrackets.hasPendingEvent(SMEM_ACCESS)) { Wait.LgkmCnt = 0; } } // Early-out if no wait is indicated. - if (!ScoreBrackets->simplifyWaitcnt(Wait) && !IsForceEmitWaitcnt) { + if (!ScoreBrackets.simplifyWaitcnt(Wait) && !IsForceEmitWaitcnt) { + bool Modified = false; if (OldWaitcntInstr) { if (TrackedWaitcntSet.count(OldWaitcntInstr)) { TrackedWaitcntSet.erase(OldWaitcntInstr); OldWaitcntInstr->eraseFromParent(); + Modified = true; } else { int64_t Imm = OldWaitcntInstr->getOperand(0).getImm(); - ScoreBrackets->applyWaitcnt(AMDGPU::decodeWaitcnt(IV, Imm)); + ScoreBrackets.applyWaitcnt(AMDGPU::decodeWaitcnt(IV, Imm)); } + Modified = true; } - return; + return Modified; } if (ForceEmitZeroWaitcnts) @@ -1080,7 +1037,7 @@ if (ForceEmitWaitcnt[LGKM_CNT]) Wait.LgkmCnt = 0; - ScoreBrackets->applyWaitcnt(Wait); + ScoreBrackets.applyWaitcnt(Wait); AMDGPU::Waitcnt OldWait; if (OldWaitcntInstr) { @@ -1088,22 +1045,7 @@ AMDGPU::decodeWaitcnt(IV, OldWaitcntInstr->getOperand(0).getImm()); } if (OldWait.dominates(Wait)) - return; - - MachineLoop *ContainingLoop = MLI->getLoopFor(MI.getParent()); - if (ContainingLoop) { - MachineBasicBlock *TBB = ContainingLoop->getHeader(); - BlockWaitcntBrackets *ScoreBracket = BlockWaitcntBracketsMap[TBB].get(); - if (!ScoreBracket) { - assert(!BlockVisitedSet.count(TBB)); - BlockWaitcntBracketsMap[TBB] = - llvm::make_unique(ST); - ScoreBracket = BlockWaitcntBracketsMap[TBB].get(); - } - ScoreBracket->setRevisitLoop(true); - LLVM_DEBUG(dbgs() << "set-revisit2: Block" - << ContainingLoop->getHeader()->getNumber() << '\n';); - } + return false; if (OldWaitcntInstr && !TrackedWaitcntSet.count(OldWaitcntInstr)) Wait = Wait.combined(OldWait); @@ -1125,22 +1067,8 @@ << "Old Instr: " << MI << '\n' << "New Instr: " << *SWaitInst << '\n'); } -} - -void SIInsertWaitcnts::insertWaitcntBeforeCF(MachineBasicBlock &MBB, - MachineInstr *Waitcnt) { - if (MBB.empty()) { - MBB.push_back(Waitcnt); - return; - } - MachineBasicBlock::iterator It = MBB.end(); - MachineInstr *MI = &*(--It); - if (MI->isBranch()) { - MBB.insert(It, Waitcnt); - } else { - MBB.push_back(Waitcnt); - } + return true; } // This is a flat memory operation. Check to see if it has memory @@ -1158,8 +1086,8 @@ return false; } -void SIInsertWaitcnts::updateEventWaitcntAfter( - MachineInstr &Inst, BlockWaitcntBrackets *ScoreBrackets) { +void SIInsertWaitcnts::updateEventWaitcntAfter(MachineInstr &Inst, + WaitcntBrackets *ScoreBrackets) { // Now look at the instruction opcode. If it is a memory access // instruction, update the upper-bound of the appropriate counter's // bracket and the destination operand scores. 
@@ -1225,142 +1153,81 @@ } } -// Merge the score brackets of the Block's predecessors; -// this merged score bracket is used when adding waitcnts to the Block -void SIInsertWaitcnts::mergeInputScoreBrackets(MachineBasicBlock &Block) { - BlockWaitcntBrackets *ScoreBrackets = BlockWaitcntBracketsMap[&Block].get(); - uint32_t MaxPending[NUM_INST_CNTS] = {0}; - uint32_t MaxFlat[NUM_INST_CNTS] = {0}; - - // For single basic block loops, we need to retain the Block's - // score bracket to have accurate Pred info. So, make a copy of Block's - // score bracket, clear() it (which retains several important bits of info), - // populate, and then replace en masse. For non-single basic block loops, - // just clear Block's current score bracket and repopulate in-place. - bool IsSelfPred; - std::unique_ptr S; - - IsSelfPred = (std::find(Block.pred_begin(), Block.pred_end(), &Block)) - != Block.pred_end(); - if (IsSelfPred) { - S = llvm::make_unique(*ScoreBrackets); - ScoreBrackets = S.get(); - } - - ScoreBrackets->clear(); - - // See if there are any uninitialized predecessors. If so, emit an - // s_waitcnt 0 at the beginning of the block. - for (MachineBasicBlock *Pred : Block.predecessors()) { - BlockWaitcntBrackets *PredScoreBrackets = - BlockWaitcntBracketsMap[Pred].get(); - bool Visited = BlockVisitedSet.count(Pred); - if (!Visited) - continue; - for (auto T : inst_counter_types()) { - uint32_t span = - PredScoreBrackets->getScoreUB(T) - PredScoreBrackets->getScoreLB(T); - MaxPending[T] = std::max(MaxPending[T], span); - span = - PredScoreBrackets->pendingFlat(T) - PredScoreBrackets->getScoreLB(T); - MaxFlat[T] = std::max(MaxFlat[T], span); - } - } +bool WaitcntBrackets::mergeScore(const MergeInfo &M, uint32_t &Score, + uint32_t OtherScore) { + uint32_t MyShifted = Score <= M.OldLB ? 0 : Score + M.MyShift; + uint32_t OtherShifted = + OtherScore <= M.OtherLB ? 0 : OtherScore + M.OtherShift; + Score = std::max(MyShifted, OtherShifted); + return OtherShifted > MyShifted; +} + +/// Merge the pending events and associater score brackets of \p Other into +/// this brackets status. +/// +/// Returns whether the merge resulted in a change that requires tighter waits +/// (i.e. the merged brackets strictly dominate the original brackets). +bool WaitcntBrackets::merge(const WaitcntBrackets &Other) { + bool StrictDom = false; - // Now set the current Block's brackets to the largest ending bracket. for (auto T : inst_counter_types()) { - ScoreBrackets->setScoreUB(T, MaxPending[T]); - ScoreBrackets->setScoreLB(T, 0); - ScoreBrackets->setLastFlat(T, MaxFlat[T]); - } + // Merge event flags for this counter + const bool OldOutOfOrder = counterOutOfOrder(T); + const uint32_t OldEvents = PendingEvents & WaitEventMaskForInst[T]; + const uint32_t OtherEvents = Other.PendingEvents & WaitEventMaskForInst[T]; + if (OtherEvents & ~OldEvents) + StrictDom = true; + if (Other.MixedPendingEvents[T] || + (OldEvents && OtherEvents && OldEvents != OtherEvents)) + MixedPendingEvents[T] = true; + PendingEvents |= OtherEvents; - // Set the register scoreboard. - for (MachineBasicBlock *Pred : Block.predecessors()) { - if (!BlockVisitedSet.count(Pred)) { - continue; - } + // Merge scores for this counter + const uint32_t MyPending = ScoreUBs[T] - ScoreLBs[T]; + const uint32_t OtherPending = Other.ScoreUBs[T] - Other.ScoreLBs[T]; + MergeInfo M; + M.OldLB = ScoreLBs[T]; + M.OtherLB = Other.ScoreLBs[T]; + M.MyShift = OtherPending > MyPending ? 
OtherPending - MyPending : 0; + M.OtherShift = ScoreUBs[T] - Other.ScoreUBs[T] + M.MyShift; - BlockWaitcntBrackets *PredScoreBrackets = - BlockWaitcntBracketsMap[Pred].get(); + const uint32_t NewUB = ScoreUBs[T] + M.MyShift; + if (NewUB < ScoreUBs[T]) + report_fatal_error("waitcnt score overflow"); + ScoreUBs[T] = NewUB; + ScoreLBs[T] = std::min(M.OldLB + M.MyShift, M.OtherLB + M.OtherShift); - // Now merge the gpr_reg_score information - for (auto T : inst_counter_types()) { - uint32_t PredLB = PredScoreBrackets->getScoreLB(T); - uint32_t PredUB = PredScoreBrackets->getScoreUB(T); - if (PredLB < PredUB) { - uint32_t PredScale = MaxPending[T] - PredUB; - // Merge vgpr scores. - for (int J = 0; J <= PredScoreBrackets->getMaxVGPR(); J++) { - uint32_t PredRegScore = PredScoreBrackets->getRegScore(J, T); - if (PredRegScore <= PredLB) - continue; - uint32_t NewRegScore = PredScale + PredRegScore; - ScoreBrackets->setRegScore( - J, T, std::max(ScoreBrackets->getRegScore(J, T), NewRegScore)); - } - // Also need to merge sgpr scores for lgkm_cnt. - if (T == LGKM_CNT) { - for (int J = 0; J <= PredScoreBrackets->getMaxSGPR(); J++) { - uint32_t PredRegScore = - PredScoreBrackets->getRegScore(J + NUM_ALL_VGPRS, LGKM_CNT); - if (PredRegScore <= PredLB) - continue; - uint32_t NewRegScore = PredScale + PredRegScore; - ScoreBrackets->setRegScore( - J + NUM_ALL_VGPRS, LGKM_CNT, - std::max( - ScoreBrackets->getRegScore(J + NUM_ALL_VGPRS, LGKM_CNT), - NewRegScore)); - } - } - } - } + StrictDom |= mergeScore(M, LastFlat[T], Other.LastFlat[T]); - ScoreBrackets->mergePendingEvents(*PredScoreBrackets); - } - - // if a single block loop, update the score brackets. Not needed for other - // blocks, as we did this in-place - if (IsSelfPred) { - BlockWaitcntBracketsMap[&Block] = llvm::make_unique(*ScoreBrackets); - } -} - -/// Return true if the given basic block is a "bottom" block of a loop. -/// This works even if the loop is discontiguous. This also handles -/// multiple back-edges for the same "header" block of a loop. -bool SIInsertWaitcnts::isLoopBottom(const MachineLoop *Loop, - const MachineBasicBlock *Block) { - for (MachineBasicBlock *MBB : Loop->blocks()) { - if (MBB == Block && MBB->isSuccessor(Loop->getHeader())) { - return true; + bool RegStrictDom = false; + for (int J = 0, E = std::max(getMaxVGPR(), Other.getMaxVGPR()) + 1; J != E; + J++) { + RegStrictDom |= mergeScore(M, VgprScores[T][J], Other.VgprScores[T][J]); } - } - return false; -} -/// Count the number of "bottom" basic blocks of a loop. -unsigned SIInsertWaitcnts::countNumBottomBlocks(const MachineLoop *Loop) { - unsigned Count = 0; - for (MachineBasicBlock *MBB : Loop->blocks()) { - if (MBB->isSuccessor(Loop->getHeader())) { - Count++; + if (T == LGKM_CNT) { + for (int J = 0, E = std::max(getMaxSGPR(), Other.getMaxSGPR()) + 1; + J != E; J++) { + RegStrictDom |= mergeScore(M, SgprScores[J], Other.SgprScores[J]); + } } + + if (RegStrictDom && !OldOutOfOrder) + StrictDom = true; } - return Count; + + return StrictDom; } // Generate s_waitcnt instructions where needed. -void SIInsertWaitcnts::insertWaitcntInBlock(MachineFunction &MF, - MachineBasicBlock &Block) { - // Initialize the state information. 
- mergeInputScoreBrackets(Block); - - BlockWaitcntBrackets *ScoreBrackets = BlockWaitcntBracketsMap[&Block].get(); +bool SIInsertWaitcnts::insertWaitcntInBlock(MachineFunction &MF, + MachineBasicBlock &Block, + WaitcntBrackets &ScoreBrackets) { + bool Modified = false; LLVM_DEBUG({ dbgs() << "*** Block" << Block.getNumber() << " ***"; - ScoreBrackets->dump(); + ScoreBrackets.dump(); }); // Walk over the instructions. @@ -1381,10 +1248,11 @@ // Two successive s_waitcnt's, both of which are pre-existing and // are therefore preserved. int64_t Imm = OldWaitcntInstr->getOperand(0).getImm(); - ScoreBrackets->applyWaitcnt(AMDGPU::decodeWaitcnt(IV, Imm)); + ScoreBrackets.applyWaitcnt(AMDGPU::decodeWaitcnt(IV, Imm)); } else { ++Iter; Inst.eraseFromParent(); + Modified = true; continue; } } @@ -1397,9 +1265,9 @@ bool VCCZBugWorkAround = false; if (readsVCCZ(Inst) && (!VCCZBugHandledSet.count(&Inst))) { - if (ScoreBrackets->getScoreLB(LGKM_CNT) < - ScoreBrackets->getScoreUB(LGKM_CNT) && - ScoreBrackets->hasPendingEvent(SMEM_ACCESS)) { + if (ScoreBrackets.getScoreLB(LGKM_CNT) < + ScoreBrackets.getScoreUB(LGKM_CNT) && + ScoreBrackets.hasPendingEvent(SMEM_ACCESS)) { if (ST->getGeneration() <= AMDGPUSubtarget::SEA_ISLANDS) VCCZBugWorkAround = true; } @@ -1407,10 +1275,10 @@ // Generate an s_waitcnt instruction to be placed before // cur_Inst, if needed. - generateWaitcntInstBefore(Inst, ScoreBrackets, OldWaitcntInstr); + Modified |= generateWaitcntInstBefore(Inst, ScoreBrackets, OldWaitcntInstr); OldWaitcntInstr = nullptr; - updateEventWaitcntAfter(Inst, ScoreBrackets); + updateEventWaitcntAfter(Inst, &ScoreBrackets); #if 0 // TODO: implement resource type check controlled by options with ub = LB. // If this instruction generates a S_SETVSKIP because it is an @@ -1425,7 +1293,7 @@ LLVM_DEBUG({ Inst.print(dbgs()); - ScoreBrackets->dump(); + ScoreBrackets.dump(); }); // Check to see if this is a GWS instruction. If so, and if this is CI or @@ -1437,7 +1305,7 @@ Inst.getOpcode() == AMDGPU::DS_GWS_SEMA_P || Inst.getOpcode() == AMDGPU::DS_GWS_BARRIER) { // TODO: && context->target_info->GwsRequiresMemViolTest() ) { - ScoreBrackets->applyWaitcnt(AMDGPU::Waitcnt::allZero()); + ScoreBrackets.applyWaitcnt(AMDGPU::Waitcnt::allZero()); } // TODO: Remove this work-around after fixing the scheduler and enable the @@ -1450,70 +1318,13 @@ AMDGPU::VCC) .addReg(AMDGPU::VCC); VCCZBugHandledSet.insert(&Inst); + Modified = true; } ++Iter; } - // Check if we need to force convergence at loop footer. - MachineLoop *ContainingLoop = MLI->getLoopFor(&Block); - if (ContainingLoop && isLoopBottom(ContainingLoop, &Block)) { - LoopWaitcntData *WaitcntData = LoopWaitcntDataMap[ContainingLoop].get(); - WaitcntData->print(); - LLVM_DEBUG(dbgs() << '\n';); - - // The iterative waitcnt insertion algorithm aims for optimal waitcnt - // placement, but doesn't guarantee convergence for a loop. Each - // loop should take at most (n+1) iterations for it to converge naturally, - // where n is the number of bottom blocks. If this threshold is reached and - // the result hasn't converged, then we force convergence by inserting - // a s_waitcnt at the end of loop footer. - if (WaitcntData->getIterCnt() > (countNumBottomBlocks(ContainingLoop) + 1)) { - // To ensure convergence, need to make wait events at loop footer be no - // more than those from the previous iteration. - // As a simplification, instead of tracking individual scores and - // generating the precise wait count, just wait on 0. 
-        bool HasPending = false;
-        MachineInstr *SWaitInst = WaitcntData->getWaitcnt();
-        for (auto T : inst_counter_types()) {
-          if (ScoreBrackets->getScoreUB(T) > ScoreBrackets->getScoreLB(T)) {
-            ScoreBrackets->setScoreLB(T, ScoreBrackets->getScoreUB(T));
-            HasPending = true;
-            break;
-          }
-        }
-
-        if (HasPending) {
-          if (!SWaitInst) {
-            SWaitInst = BuildMI(Block, Block.getFirstNonPHI(),
-                                DebugLoc(), TII->get(AMDGPU::S_WAITCNT))
-                            .addImm(0);
-            TrackedWaitcntSet.insert(SWaitInst);
-#if 0 // TODO: Format the debug output
-            OutputTransformBanner("insertWaitcntInBlock",0,"Create:",context);
-            OutputTransformAdd(SWaitInst, context);
-#endif
-          }
-#if 0 // TODO: ??
-          _DEV( REPORTED_STATS->force_waitcnt_converge = 1; )
-#endif
-        }
-
-        if (SWaitInst) {
-          LLVM_DEBUG({
-            SWaitInst->print(dbgs());
-            dbgs() << "\nAdjusted score board:";
-            ScoreBrackets->dump();
-          });
-
-          // Add this waitcnt to the block. It is either newly created or
-          // created in previous iterations and added back since block traversal
-          // always removes waitcnts.
-          insertWaitcntBeforeCF(Block, SWaitInst);
-          WaitcntData->setWaitcnt(SWaitInst);
-        }
-      }
-    }
+  return Modified;
 }

 bool SIInsertWaitcnts::runOnMachineFunction(MachineFunction &MF) {
@@ -1521,7 +1332,6 @@
   TII = ST->getInstrInfo();
   TRI = &TII->getRegisterInfo();
   MRI = &MF.getRegInfo();
-  MLI = &getAnalysis<MachineLoopInfo>();
   IV = AMDGPU::getIsaVersion(ST->getCPU());

   const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
@@ -1546,93 +1356,70 @@
       RegisterEncoding.SGPR0 + HardwareLimits.NumSGPRsMax - 1;

   TrackedWaitcntSet.clear();
-  BlockVisitedSet.clear();
   VCCZBugHandledSet.clear();
-  LoopWaitcntDataMap.clear();
-  BlockWaitcntProcessedSet.clear();
+  RpotIdxMap.clear();
+  BlockInfos.clear();

-  // Walk over the blocks in reverse post order, inserting
-  // s_waitcnt where needed.
-  ReversePostOrderTraversal<MachineFunction *> RPOT(&MF);
+  // Keep iterating over the blocks in reverse post order, inserting and
+  // updating s_waitcnt where needed, until a fix point is reached.
+  for (MachineBasicBlock *MBB :
+       ReversePostOrderTraversal<MachineFunction *>(&MF)) {
+    RpotIdxMap[MBB] = BlockInfos.size();
+    BlockInfos.emplace_back(MBB);
+  }
+
+  std::unique_ptr<WaitcntBrackets> Brackets;
   bool Modified = false;
-  for (ReversePostOrderTraversal<MachineFunction *>::rpo_iterator
-           I = RPOT.begin(),
-           E = RPOT.end(), J = RPOT.begin();
-       I != E;) {
-    MachineBasicBlock &MBB = **I;
-
-    BlockVisitedSet.insert(&MBB);
-
-    BlockWaitcntBrackets *ScoreBrackets = BlockWaitcntBracketsMap[&MBB].get();
-    if (!ScoreBrackets) {
-      BlockWaitcntBracketsMap[&MBB] = llvm::make_unique<BlockWaitcntBrackets>(ST);
-      ScoreBrackets = BlockWaitcntBracketsMap[&MBB].get();
-    }
-    ScoreBrackets->setPostOrder(MBB.getNumber());
-    MachineLoop *ContainingLoop = MLI->getLoopFor(&MBB);
-    if (ContainingLoop && LoopWaitcntDataMap[ContainingLoop] == nullptr)
-      LoopWaitcntDataMap[ContainingLoop] = llvm::make_unique<LoopWaitcntData>();
-
-    // If we are walking into the block from before the loop, then guarantee
-    // at least 1 re-walk over the loop to propagate the information, even if
-    // no S_WAITCNT instructions were generated.
-    if (ContainingLoop && ContainingLoop->getHeader() == &MBB) {
-      unsigned Count = countNumBottomBlocks(ContainingLoop);
-
-      // If the loop has multiple back-edges, and so more than one "bottom"
-      // basic block, we have to guarantee a re-walk over every blocks.
-      if ((std::count(BlockWaitcntProcessedSet.begin(),
-                      BlockWaitcntProcessedSet.end(), &MBB) < (int)Count)) {
-        BlockWaitcntBracketsMap[&MBB]->setRevisitLoop(true);
-        LLVM_DEBUG(dbgs() << "set-revisit1: Block"
-                          << ContainingLoop->getHeader()->getNumber() << '\n';);
-      }
-    }
+  bool Repeat;
+  do {
+    Repeat = false;

-    // Walk over the instructions.
-    insertWaitcntInBlock(MF, MBB);
-
-    // Record that waitcnts have been processed at least once for this block.
-    BlockWaitcntProcessedSet.push_back(&MBB);
-
-    // See if we want to revisit the loop. If a loop has multiple back-edges,
-    // we shouldn't revisit the same "bottom" basic block.
-    if (ContainingLoop && isLoopBottom(ContainingLoop, &MBB) &&
-        std::count(BlockWaitcntProcessedSet.begin(),
-                   BlockWaitcntProcessedSet.end(), &MBB) == 1) {
-      MachineBasicBlock *EntryBB = ContainingLoop->getHeader();
-      BlockWaitcntBrackets *EntrySB = BlockWaitcntBracketsMap[EntryBB].get();
-      if (EntrySB && EntrySB->getRevisitLoop()) {
-        EntrySB->setRevisitLoop(false);
-        J = I;
-        int32_t PostOrder = EntrySB->getPostOrder();
-        // TODO: Avoid this loop. Find another way to set I.
-        for (ReversePostOrderTraversal<MachineFunction *>::rpo_iterator
-                 X = RPOT.begin(),
-                 Y = RPOT.end();
-             X != Y; ++X) {
-          MachineBasicBlock &MBBX = **X;
-          if (MBBX.getNumber() == PostOrder) {
-            I = X;
-            break;
-          }
-        }
-        LoopWaitcntData *WaitcntData = LoopWaitcntDataMap[ContainingLoop].get();
-        WaitcntData->incIterCnt();
-        LLVM_DEBUG(dbgs() << "revisit: Block" << EntryBB->getNumber() << '\n';);
+    for (BlockInfo &BI : BlockInfos) {
+      if (!BI.Dirty)
         continue;
+
+      unsigned Idx = std::distance(&*BlockInfos.begin(), &BI);
+
+      if (BI.Incoming) {
+        if (!Brackets)
+          Brackets = llvm::make_unique<WaitcntBrackets>(*BI.Incoming);
+        else
+          *Brackets = *BI.Incoming;
       } else {
-        LoopWaitcntData *WaitcntData = LoopWaitcntDataMap[ContainingLoop].get();
-        // Loop converged, reset iteration count. If this loop gets revisited,
-        // it must be from an outer loop, the counter will restart, this will
-        // ensure we don't force convergence on such revisits.
-        WaitcntData->resetIterCnt();
+        if (!Brackets)
+          Brackets = llvm::make_unique<WaitcntBrackets>(ST);
+        else
+          Brackets->clear();
+      }
+
+      Modified |= insertWaitcntInBlock(MF, *BI.MBB, *Brackets);
+      BI.Dirty = false;
+
+      if (Brackets->hasPending()) {
+        BlockInfo *MoveBracketsToSucc = nullptr;
+        for (MachineBasicBlock *Succ : BI.MBB->successors()) {
+          unsigned SuccIdx = RpotIdxMap[Succ];
+          BlockInfo &SuccBI = BlockInfos[SuccIdx];
+          if (!SuccBI.Incoming) {
+            SuccBI.Dirty = true;
+            if (SuccIdx <= Idx)
+              Repeat = true;
+            if (!MoveBracketsToSucc) {
+              MoveBracketsToSucc = &SuccBI;
+            } else {
+              SuccBI.Incoming = llvm::make_unique<WaitcntBrackets>(*Brackets);
+            }
+          } else if (SuccBI.Incoming->merge(*Brackets)) {
+            SuccBI.Dirty = true;
+            if (SuccIdx <= Idx)
+              Repeat = true;
+          }
+        }
+        if (MoveBracketsToSucc)
+          MoveBracketsToSucc->Incoming = std::move(Brackets);
       }
     }
-
-    J = I;
-    ++I;
-  }
+  } while (Repeat);

   SmallVector<MachineBasicBlock *, 4> EndPgmBlocks;
Index: llvm/trunk/test/CodeGen/AMDGPU/waitcnt-loop-irreducible.mir
===================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/waitcnt-loop-irreducible.mir
+++ llvm/trunk/test/CodeGen/AMDGPU/waitcnt-loop-irreducible.mir
@@ -0,0 +1,47 @@
+# RUN: llc -march=amdgcn -verify-machineinstrs -run-pass si-insert-waitcnts -o - %s | FileCheck -check-prefixes=GCN %s
+
+# GCN-LABEL: name: irreducible_loop{{$}}
+# GCN: S_LOAD_DWORDX4_IMM
+# GCN: S_WAITCNT 127{{$}}
+# GCN: S_BUFFER_LOAD_DWORD_IMM
+# GCN: S_WAITCNT 127{{$}}
+# GCN: S_CMP_GE_I32
+--- |
+
+  define amdgpu_ps void @irreducible_loop() {
+  main:
+    ret void
+  }
+
+...
+---
+name: irreducible_loop
+body: |
+  bb.0:
+    successors: %bb.3, %bb.2
+
+    S_CBRANCH_VCCZ %bb.2, implicit $vcc
+    S_BRANCH %bb.3
+
+  bb.1:
+    successors: %bb.3, %bb.2
+
+    S_CBRANCH_VCCNZ %bb.3, implicit $vcc
+
+  bb.2:
+    successors: %bb.3
+
+    renamable $sgpr4_sgpr5_sgpr6_sgpr7 = S_LOAD_DWORDX4_IMM renamable $sgpr0_sgpr1, 0, 0
+    renamable $sgpr3 = S_BUFFER_LOAD_DWORD_IMM killed renamable $sgpr4_sgpr5_sgpr6_sgpr7, 0, 0
+
+  bb.3:
+    successors: %bb.1, %bb.4
+
+    S_CMP_GE_I32 renamable $sgpr2, renamable $sgpr3, implicit-def $scc
+    S_CBRANCH_SCC0 %bb.1, implicit killed $scc
+
+  bb.4:
+
+    S_ENDPGM
+
+...
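
Note (not part of the patch): the standalone sketch below replays, for a single counter and a single register, the relative-score merge that WaitcntBrackets::merge performs when an incoming bracket from one predecessor is combined with the bracket arriving from another. Only the MergeInfo/mergeScore arithmetic is taken from the hunks above; the ToyBrackets type, the merge() wrapper, and the example values are invented here for illustration, whereas the pass applies the same shifts to every VGPR/SGPR score and to LastFlat.

// Standalone illustration of the relative-score merge used by
// WaitcntBrackets::merge. Scores are only meaningful relative to a bracket's
// (ScoreLB, ScoreUB] window, so merging two brackets first translates both
// into a common coordinate space before taking per-entry maxima.
#include <algorithm>
#include <cstdint>
#include <iostream>

namespace {

struct ToyBrackets {
  uint32_t ScoreLB = 0;  // oldest event still outstanding
  uint32_t ScoreUB = 0;  // most recent event issued
  uint32_t RegScore = 0; // score of one tracked register, 0 = nothing pending
};

struct MergeInfo {
  uint32_t OldLB;
  uint32_t OtherLB;
  uint32_t MyShift;
  uint32_t OtherShift;
};

// Same rule as WaitcntBrackets::mergeScore: scores at or below the old lower
// bound carry no pending wait and collapse to 0; everything else is shifted
// into the merged coordinate space, and the larger (younger) score wins.
bool mergeScore(const MergeInfo &M, uint32_t &Score, uint32_t OtherScore) {
  uint32_t MyShifted = Score <= M.OldLB ? 0 : Score + M.MyShift;
  uint32_t OtherShifted =
      OtherScore <= M.OtherLB ? 0 : OtherScore + M.OtherShift;
  Score = std::max(MyShifted, OtherShifted);
  return OtherShifted > MyShifted;
}

// Mirrors the per-counter body of WaitcntBrackets::merge for one register.
void merge(ToyBrackets &This, const ToyBrackets &Other) {
  const uint32_t MyPending = This.ScoreUB - This.ScoreLB;
  const uint32_t OtherPending = Other.ScoreUB - Other.ScoreLB;
  MergeInfo M;
  M.OldLB = This.ScoreLB;
  M.OtherLB = Other.ScoreLB;
  M.MyShift = OtherPending > MyPending ? OtherPending - MyPending : 0;
  M.OtherShift = This.ScoreUB - Other.ScoreUB + M.MyShift;

  This.ScoreUB = This.ScoreUB + M.MyShift;
  This.ScoreLB = std::min(M.OldLB + M.MyShift, M.OtherLB + M.OtherShift);
  mergeScore(M, This.RegScore, Other.RegScore);
}

} // namespace

int main() {
  // Path A: events 3..5 still outstanding, register last written by event 4.
  ToyBrackets A{/*ScoreLB=*/2, /*ScoreUB=*/5, /*RegScore=*/4};
  // Path B: events 1..4 still outstanding, register last written by event 3.
  ToyBrackets B{/*ScoreLB=*/0, /*ScoreUB=*/4, /*RegScore=*/3};

  merge(A, B);

  // Both upper bounds are shifted onto the same value (6 here), the merged
  // pending span is the larger of the two (4), and the register score lands
  // one event below the merged upper bound on both paths.
  std::cout << "merged LB=" << A.ScoreLB << " UB=" << A.ScoreUB
            << " RegScore=" << A.RegScore << '\n'; // merged LB=2 UB=6 RegScore=5
  return 0;
}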