Index: llvm/trunk/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
===================================================================
--- llvm/trunk/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ llvm/trunk/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -874,13 +874,13 @@
   // cases.
   addPass(&PostRAHazardRecognizerID);
 
+  addPass(createSIMemoryLegalizerPass());
   if (EnableSIInsertWaitcntsPass)
     addPass(createSIInsertWaitcntsPass());
   else
     addPass(createSIInsertWaitsPass());
   addPass(createSIShrinkInstructionsPass());
   addPass(&SIInsertSkipsPassID);
-  addPass(createSIMemoryLegalizerPass());
   addPass(createSIDebuggerInsertNopsPass());
   addPass(&BranchRelaxationPassID);
 }
Index: llvm/trunk/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
===================================================================
--- llvm/trunk/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
+++ llvm/trunk/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
@@ -361,7 +361,7 @@
   AMDGPUAS AMDGPUASI;
 
   DenseSet<MachineBasicBlock *> BlockVisitedSet;
-  DenseSet<MachineInstr *> CompilerGeneratedWaitcntSet;
+  DenseSet<MachineInstr *> TrackedWaitcntSet;
   DenseSet<MachineInstr *> VCCZBugHandledSet;
 
   DenseMap<MachineBasicBlock *, std::unique_ptr<BlockWaitcntBrackets>>
@@ -1114,7 +1114,7 @@
         BlockWaitcntBrackets *ScoreBracket =
             BlockWaitcntBracketsMap[TBB].get();
         if (!ScoreBracket) {
-          assert(BlockVisitedSet.find(TBB) == BlockVisitedSet.end());
+          assert(!BlockVisitedSet.count(TBB));
           BlockWaitcntBracketsMap[TBB] =
               llvm::make_unique<BlockWaitcntBrackets>();
           ScoreBracket = BlockWaitcntBracketsMap[TBB].get();
@@ -1132,7 +1132,7 @@
       } else {
         SWaitInst = MF.CreateMachineInstr(TII->get(AMDGPU::S_WAITCNT),
                                           MI.getDebugLoc());
-        CompilerGeneratedWaitcntSet.insert(SWaitInst);
+        TrackedWaitcntSet.insert(SWaitInst);
       }
 
       const MachineOperand &Op =
@@ -1267,7 +1267,7 @@
   for (MachineBasicBlock *pred : Block.predecessors()) {
     BlockWaitcntBrackets *PredScoreBrackets =
         BlockWaitcntBracketsMap[pred].get();
-    bool Visited = BlockVisitedSet.find(pred) != BlockVisitedSet.end();
+    bool Visited = BlockVisitedSet.count(pred);
     if (!Visited || PredScoreBrackets->getWaitAtBeginning()) {
       continue;
     }
@@ -1306,7 +1306,7 @@
   for (MachineBasicBlock *Pred : Block.predecessors()) {
     BlockWaitcntBrackets *PredScoreBrackets =
         BlockWaitcntBracketsMap[Pred].get();
-    bool Visited = BlockVisitedSet.find(Pred) != BlockVisitedSet.end();
+    bool Visited = BlockVisitedSet.count(Pred);
     if (!Visited || PredScoreBrackets->getWaitAtBeginning()) {
       continue;
     }
@@ -1354,7 +1354,7 @@
 
   // Set the register scoreboard.
   for (MachineBasicBlock *Pred : Block.predecessors()) {
-    if (BlockVisitedSet.find(Pred) == BlockVisitedSet.end()) {
+    if (!BlockVisitedSet.count(Pred)) {
       continue;
     }
 
@@ -1468,7 +1468,7 @@
   // sequencing predecessors, because changes to EXEC require waitcnts due to
   // the delayed nature of these operations.
   for (MachineBasicBlock *Pred : Block.predecessors()) {
-    if (BlockVisitedSet.find(Pred) == BlockVisitedSet.end()) {
+    if (!BlockVisitedSet.count(Pred)) {
       continue;
     }
 
@@ -1530,8 +1530,7 @@
     if (Inst.getOpcode() == AMDGPU::S_WAITCNT) {
       // TODO: Register the old waitcnt and optimize the following waitcnts.
       // Leaving the previously existing waitcnts is conservatively correct.
-      if (CompilerGeneratedWaitcntSet.find(&Inst) ==
-          CompilerGeneratedWaitcntSet.end())
+      if (!TrackedWaitcntSet.count(&Inst))
        ++Iter;
      else {
        ScoreBrackets->setWaitcnt(&Inst);
@@ -1550,7 +1549,7 @@
 
    bool VCCZBugWorkAround = false;
    if (readsVCCZ(Inst) &&
-        (VCCZBugHandledSet.find(&Inst) == VCCZBugHandledSet.end())) {
+        (!VCCZBugHandledSet.count(&Inst))) {
      if (ScoreBrackets->getScoreLB(LGKM_CNT) <
              ScoreBrackets->getScoreUB(LGKM_CNT) &&
          ScoreBrackets->hasPendingSMEM()) {
@@ -1564,11 +1563,29 @@
 
    MachineInstr *SWaitInst = generateSWaitCntInstBefore(Inst, ScoreBrackets);
 
    if (SWaitInst) {
-      Block.insert(Inst, SWaitInst);
-      if (ScoreBrackets->getWaitcnt() != SWaitInst) {
-        DEBUG(dbgs() << "insertWaitcntInBlock\n"
-                     << "Old Instr: " << Inst << '\n'
-                     << "New Instr: " << *SWaitInst << '\n';);
+      // We don't (yet) track waitcnts that existed prior to the waitcnt
+      // pass (we just skip over them); because the waitcnt pass is ignorant
+      // of them, it may insert a redundant waitcnt. To avoid this, check
+      // the prev instr. If it and the to-be-inserted waitcnt are the
+      // same, keep the prev waitcnt and skip the insertion. We assume that
+      // whoever inserted the prev waitcnt, e.g., the memory legalizer,
+      // really wants it there.
+      bool insertSWaitInst = true;
+      if (Iter != Block.begin()) {
+        MachineInstr *MIPrevInst = &*std::prev(Iter);
+        if (MIPrevInst &&
+            MIPrevInst->getOpcode() == AMDGPU::S_WAITCNT &&
+            MIPrevInst->getOperand(0).getImm() == SWaitInst->getOperand(0).getImm()) {
+          insertSWaitInst = false;
+        }
+      }
+      if (insertSWaitInst) {
+        Block.insert(Inst, SWaitInst);
+        if (ScoreBrackets->getWaitcnt() != SWaitInst) {
+          DEBUG(dbgs() << "insertWaitcntInBlock\n"
+                       << "Old Instr: " << Inst << '\n'
+                       << "New Instr: " << *SWaitInst << '\n';);
+        }
      }
    }
@@ -1656,7 +1673,7 @@
    if (!SWaitInst) {
      SWaitInst = Block.getParent()->CreateMachineInstr(
          TII->get(AMDGPU::S_WAITCNT), DebugLoc());
-      CompilerGeneratedWaitcntSet.insert(SWaitInst);
+      TrackedWaitcntSet.insert(SWaitInst);
      const MachineOperand &Op = MachineOperand::CreateImm(0);
      SWaitInst->addOperand(MF, Op);
 #if 0 // TODO: Format the debug output
@@ -1712,6 +1729,10 @@
  RegisterEncoding.SGPRL =
      RegisterEncoding.SGPR0 + HardwareLimits.NumSGPRsMax - 1;
 
+  TrackedWaitcntSet.clear();
+  BlockVisitedSet.clear();
+  VCCZBugHandledSet.clear();
+
  // Walk over the blocks in reverse post-dominator order, inserting
  // s_waitcnt where needed.
  ReversePostOrderTraversal<MachineFunction *> RPOT(&MF);
@@ -1738,8 +1759,7 @@
    // at least 1 re-walk over the loop to propagate the information, even if
    // no S_WAITCNT instructions were generated.
    if (ContainingLoop && ContainingLoop->getHeader() == &MBB && J < I &&
-        (BlockWaitcntProcessedSet.find(&MBB) ==
-         BlockWaitcntProcessedSet.end())) {
+        (!BlockWaitcntProcessedSet.count(&MBB))) {
      BlockWaitcntBracketsMap[&MBB]->setRevisitLoop(true);
      DEBUG(dbgs() << "set-revisit: block"
                   << ContainingLoop->getHeader()->getNumber() << '\n';);
Index: llvm/trunk/test/CodeGen/AMDGPU/waitcnt-no-redundant.mir
===================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/waitcnt-no-redundant.mir
+++ llvm/trunk/test/CodeGen/AMDGPU/waitcnt-no-redundant.mir
@@ -0,0 +1,24 @@
+# RUN: llc -mtriple=amdgcn -verify-machineinstrs -run-pass si-insert-waitcnts -o - %s | FileCheck %s
+
+# Check that the waitcnt pass does *not* insert a redundant waitcnt instr.
+# In this testcase, ensure that the pass does not insert redundant S_WAITCNT 127
+# or S_WAITCNT 3952
+
+...
+# CHECK-LABEL: name: waitcnt-no-redundant
+# CHECK: DS_READ_B64
+# CHECK-NEXT: S_WAITCNT 127
+# CHECK-NEXT: FLAT_ATOMIC_CMPSWAP
+# CHECK-NEXT: S_WAITCNT 3952
+# CHECK-NEXT: BUFFER_WBINVL1_VOL
+
+name: waitcnt-no-redundant
+body: |
+  bb.0:
+    renamable $vgpr0_vgpr1 = DS_READ_B64 killed renamable $vgpr0, 0, 0, implicit $m0, implicit $exec
+    S_WAITCNT 127
+    FLAT_ATOMIC_CMPSWAP killed renamable $vgpr0_vgpr1, killed renamable $vgpr3_vgpr4, 0, 0, implicit $exec, implicit $flat_scr
+    S_WAITCNT 3952
+    BUFFER_WBINVL1_VOL implicit $exec
+    S_ENDPGM
+...
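
Reviewer note, not part of the patch: the standalone sketch below only illustrates the de-duplication rule that the new insertSWaitInst logic applies, using a hypothetical ToyInst list in place of MachineInstr/MachineBasicBlock. ToyInst, shouldInsertWaitcnt, and everything else in it are assumptions made for the example; none of it is LLVM API.

// Illustrative sketch only: a toy model of "skip inserting an S_WAITCNT when
// the previous instruction is already an identical S_WAITCNT". ToyInst and
// shouldInsertWaitcnt are hypothetical stand-ins, not LLVM APIs.
#include <iostream>
#include <iterator>
#include <list>
#include <string>

struct ToyInst {
  std::string Opcode; // e.g. "S_WAITCNT", "FLAT_ATOMIC_CMPSWAP"
  int Imm;            // immediate operand; only meaningful for "S_WAITCNT"
};

// Decide whether a waitcnt with immediate Imm still needs to be inserted
// before the instruction at It. Mirrors the patch: if the instruction just
// before It is an S_WAITCNT with the same immediate (e.g. one placed by the
// memory legalizer), keep that one and skip the insertion.
bool shouldInsertWaitcnt(const std::list<ToyInst> &Block,
                         std::list<ToyInst>::const_iterator It, int Imm) {
  if (It == Block.begin())
    return true; // nothing precedes us, so nothing can be redundant
  const ToyInst &Prev = *std::prev(It);
  return !(Prev.Opcode == "S_WAITCNT" && Prev.Imm == Imm);
}

int main() {
  // Shape of the new test case: a pre-existing S_WAITCNT 127 already sits
  // right before the atomic, so no second waitcnt should be added there.
  std::list<ToyInst> Block = {{"DS_READ_B64", 0},
                              {"S_WAITCNT", 127},
                              {"FLAT_ATOMIC_CMPSWAP", 0}};
  auto It = std::next(Block.begin(), 2); // position of FLAT_ATOMIC_CMPSWAP
  std::cout << (shouldInsertWaitcnt(Block, It, 127) ? "insert" : "skip")
            << '\n'; // prints "skip"
  return 0;
}

The real pass performs the same comparison on MachineInstr operands via getOperand(0).getImm(); the toy list just makes the control flow of the check visible in isolation.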