Index: lib/Target/AMDGPU/SIInsertWaitcnts.cpp
===================================================================
--- lib/Target/AMDGPU/SIInsertWaitcnts.cpp
+++ lib/Target/AMDGPU/SIInsertWaitcnts.cpp
@@ -1125,7 +1125,8 @@
     if (EmitSwaitcnt != 0) {
       MachineInstr *OldWaitcnt = ScoreBrackets->getWaitcnt();
       int Imm = (!OldWaitcnt) ? 0 : OldWaitcnt->getOperand(0).getImm();
-      if (!OldWaitcnt || (AMDGPU::decodeVmcnt(IV, Imm) !=
+      if (!OldWaitcnt ||
+          (AMDGPU::decodeVmcnt(IV, Imm) !=
                           (CntVal[VM_CNT] & AMDGPU::getVmcntBitMask(IV))) ||
           (AMDGPU::decodeExpcnt(IV, Imm) !=
            (CntVal[EXP_CNT] & AMDGPU::getExpcntBitMask(IV))) ||
@@ -1143,7 +1144,7 @@
             ScoreBracket = BlockWaitcntBracketsMap[TBB].get();
           }
           ScoreBracket->setRevisitLoop(true);
-          DEBUG(dbgs() << "set-revisit: block"
+          DEBUG(dbgs() << "set-revisit: Block"
                        << ContainingLoop->getHeader()->getNumber() << '\n';);
         }
       }
@@ -1151,18 +1152,14 @@
       // Update an existing waitcount, or make a new one.
       unsigned Enc = AMDGPU::encodeWaitcnt(IV, CntVal[VM_CNT],
                                            CntVal[EXP_CNT], CntVal[LGKM_CNT]);
-      // We don't (yet) track waitcnts that existed prior to the waitcnt
-      // pass (we just skip over them); because the waitcnt pass is ignorant
-      // of them, it may insert a redundant waitcnt. To avoid this, check
-      // the prev instr. If it and the to-be-inserted waitcnt are the
-      // same, keep the prev waitcnt and skip the insertion. We assume that
-      // whomever. e.g., for memory model, inserted the prev waitcnt really
-      // wants it there.
+      // We don't remove waitcnts that existed prior to the waitcnt
+      // pass. Check if the waitcnt to-be-inserted can be avoided
+      // or if the prev waitcnt can be updated.
       bool insertSWaitInst = true;
       for (MachineBasicBlock::iterator I = MI.getIterator(),
                                        B = MI.getParent()->begin();
            insertSWaitInst && I != B; --I) {
-        if (I ==  MI.getIterator())
+        if (I == MI.getIterator())
           continue;
 
         switch (I->getOpcode()) {
@@ -1585,15 +1582,16 @@
     MachineInstr &Inst = *Iter;
     // Remove any previously existing waitcnts.
     if (Inst.getOpcode() == AMDGPU::S_WAITCNT) {
-      // TODO: Register the old waitcnt and optimize the following waitcnts.
-      // Leaving the previously existing waitcnts is conservatively correct.
+      // Leave pre-existing waitcnts, but note their existence via setWaitcnt.
+      // Remove the waitcnt-pass-generated waitcnts; the pass will add them back
+      // as needed.
       if (!TrackedWaitcntSet.count(&Inst))
         ++Iter;
       else {
-        ScoreBrackets->setWaitcnt(&Inst);
         ++Iter;
         Inst.removeFromParent();
       }
+      ScoreBrackets->setWaitcnt(&Inst);
       continue;
     }
 
@@ -1684,8 +1682,8 @@
     if (WaitcntData->getIterCnt() > 2) {
       // To ensure convergence, need to make wait events at loop footer be no
       // more than those from the previous iteration.
-      // As a simplification, Instead of tracking individual scores and
-      // generate the precise wait count, just wait on 0.
+      // As a simplification, instead of tracking individual scores and
+      // generating the precise wait count, just wait on 0.
       bool HasPending = false;
       MachineInstr *SWaitInst = WaitcntData->getWaitcnt();
       for (enum InstCounterType T = VM_CNT; T < NUM_INST_CNTS;
@@ -1722,7 +1720,7 @@
 
         // Add this waitcnt to the block. It is either newly created or
         // created in previous iterations and added back since block traversal
-        // always remove waitcnt.
+        // always removes waitcnts.
         insertWaitcntBeforeCF(Block, SWaitInst);
         WaitcntData->setWaitcnt(SWaitInst);
       }
@@ -1788,7 +1786,7 @@
     if (ContainingLoop && ContainingLoop->getHeader() == &MBB && J < I &&
         (!BlockWaitcntProcessedSet.count(&MBB))) {
       BlockWaitcntBracketsMap[&MBB]->setRevisitLoop(true);
-      DEBUG(dbgs() << "set-revisit: block"
+      DEBUG(dbgs() << "set-revisit: Block"
                    << ContainingLoop->getHeader()->getNumber() << '\n';);
     }
 
@@ -1819,7 +1817,7 @@
         }
         LoopWaitcntData *WaitcntData = LoopWaitcntDataMap[ContainingLoop].get();
         WaitcntData->incIterCnt();
-        DEBUG(dbgs() << "revisit: block" << EntryBB->getNumber() << '\n';);
+        DEBUG(dbgs() << "revisit: Block" << EntryBB->getNumber() << '\n';);
         continue;
       } else {
         LoopWaitcntData *WaitcntData = LoopWaitcntDataMap[ContainingLoop].get();
Index: test/CodeGen/AMDGPU/waitcnt-no-redundant.mir
===================================================================
--- test/CodeGen/AMDGPU/waitcnt-no-redundant.mir
+++ test/CodeGen/AMDGPU/waitcnt-no-redundant.mir
@@ -1,24 +1,25 @@
 # RUN: llc -mtriple=amdgcn -verify-machineinstrs -run-pass si-insert-waitcnts -o - %s | FileCheck %s
 
 # Check that the waitcnt pass does *not* insert a redundant waitcnt instr.
-# In this testcase, ensure that pass does not insert redundant S_WAITCNT 127
-# or S_WAITCNT 3952
+# In this testcase, ensure that pass does not insert redundant S_WAITCNT 3952
 
 ...
 # CHECK-LABEL: name: waitcnt-no-redundant
-# CHECK: DS_READ_B64
-# CHECK-NEXT: S_WAITCNT 127
+# CHECK: S_WAITCNT 3952
 # CHECK-NEXT: FLAT_ATOMIC_CMPSWAP
 # CHECK-NEXT: S_WAITCNT 3952
-# CHECK-NEXT: BUFFER_WBINVL1_VOL
+# CHECK-NEXT: BUFFER_WBINVL1
 
 name: waitcnt-no-redundant
 body: |
-    bb.0:
-    renamable $vgpr0_vgpr1 = DS_READ_B64 killed renamable $vgpr0, 0, 0, implicit $m0, implicit $exec
-    S_WAITCNT 127
-    FLAT_ATOMIC_CMPSWAP killed renamable $vgpr0_vgpr1, killed renamable $vgpr3_vgpr4, 0, 0, implicit $exec, implicit $flat_scr
+  bb.0:
+    renamable $vgpr0 = V_MOV_B32_e32 0, implicit $exec
+
+  bb.1:
+    S_WAITCNT 3952
+    FLAT_ATOMIC_CMPSWAP undef renamable $vgpr0_vgpr1, renamable $vgpr0_vgpr1, 0, 0, implicit $exec, implicit $flat_scr
     S_WAITCNT 3952
-    BUFFER_WBINVL1_VOL implicit $exec
-    S_ENDPGM
+    BUFFER_WBINVL1 implicit $exec
+    S_BRANCH %bb.1
+
 ...