Index: lib/Target/AMDGPU/AMDGPU.h
===================================================================
--- lib/Target/AMDGPU/AMDGPU.h
+++ lib/Target/AMDGPU/AMDGPU.h
@@ -46,6 +46,7 @@
 FunctionPass *createSIFixSGPRCopiesPass();
 FunctionPass *createSIDebuggerInsertNopsPass();
 FunctionPass *createSIInsertWaitsPass();
+FunctionPass *createSIInsertWaitcntsPass();
 FunctionPass *createAMDGPUCodeGenPreparePass(const GCNTargetMachine *TM = nullptr);
 
 ModulePass *createAMDGPUAnnotateKernelFeaturesPass(const TargetMachine *TM = nullptr);
@@ -123,6 +124,9 @@
 void initializeSIInsertWaitsPass(PassRegistry&);
 extern char &SIInsertWaitsID;
 
+void initializeSIInsertWaitcntsPass(PassRegistry&);
+extern char &SIInsertWaitcntsID;
+
 void initializeAMDGPUUnifyDivergentExitNodesPass(PassRegistry&);
 extern char &AMDGPUUnifyDivergentExitNodesID;
 
Index: lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
===================================================================
--- lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -105,6 +105,12 @@
   cl::desc("Enable AMDGPU Alias Analysis"),
   cl::init(true));
 
+// Option to enable new waitcnt insertion pass.
+static cl::opt<bool> EnableSIInsertWaitcntsPass(
+  "enable-si-insert-waitcnts",
+  cl::desc("Use new waitcnt insertion pass"),
+  cl::init(true));
+
 extern "C" void LLVMInitializeAMDGPUTarget() {
   // Register the target
   RegisterTargetMachine<R600TargetMachine> X(getTheAMDGPUTarget());
@@ -127,6 +133,7 @@
   initializeAMDGPUUnifyMetadataPass(*PR);
   initializeSIAnnotateControlFlowPass(*PR);
   initializeSIInsertWaitsPass(*PR);
+  initializeSIInsertWaitcntsPass(*PR);
   initializeSIWholeQuadModePass(*PR);
   initializeSILowerControlFlowPass(*PR);
   initializeSIInsertSkipsPass(*PR);
@@ -794,7 +801,10 @@
   // cases.
   addPass(&PostRAHazardRecognizerID);
 
-  addPass(createSIInsertWaitsPass());
+  if (EnableSIInsertWaitcntsPass)
+    addPass(createSIInsertWaitcntsPass());
+  else
+    addPass(createSIInsertWaitsPass());
   addPass(createSIShrinkInstructionsPass());
   addPass(&SIInsertSkipsPassID);
   addPass(createSIDebuggerInsertNopsPass());
Index: lib/Target/AMDGPU/CMakeLists.txt
===================================================================
--- lib/Target/AMDGPU/CMakeLists.txt
+++ lib/Target/AMDGPU/CMakeLists.txt
@@ -82,6 +82,7 @@
   SIFrameLowering.cpp
   SIInsertSkips.cpp
   SIInsertWaits.cpp
+  SIInsertWaitcnts.cpp
   SIInstrInfo.cpp
   SIISelLowering.cpp
   SILoadStoreOptimizer.cpp
Index: lib/Target/AMDGPU/SIInsertWaitcnts.cpp
===================================================================
--- /dev/null
+++ lib/Target/AMDGPU/SIInsertWaitcnts.cpp
@@ -0,0 +1,1860 @@
+//===-- SIInsertWaitcnts.cpp - Insert Wait Instructions -------------------===//
+
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// \brief Insert wait instructions for memory reads and writes.
+///
+/// Memory reads and writes are issued asynchronously, so we need to insert
+/// S_WAITCNT instructions when we want to access any of their results or
+/// overwrite any register that's used asynchronously.
+//
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPU.h"
+#include "AMDGPUSubtarget.h"
+#include "SIDefines.h"
+#include "SIInstrInfo.h"
+#include "SIMachineFunctionInfo.h"
+#include "Utils/AMDGPUBaseInfo.h"
+#include "llvm/ADT/PostOrderIterator.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+
+#define DEBUG_TYPE "si-insert-waitcnts"
+
+using namespace llvm;
+
+namespace {
+
+// Class of object that encapsulates the latest instruction counter score
+// associated with the operand.  Used for determining whether an
+// s_waitcnt instruction needs to be emitted.
+
+#define CNT_MASK(t) (1u << (t)) // Bit for counter t in a wait-needed mask.
+
+enum InstCounterType { VM_CNT = 0, LGKM_CNT, EXP_CNT, NUM_INST_CNTS };
+
+typedef std::pair<unsigned, unsigned> RegInterval; // [first, second) slots.
+
+struct {
+  int32_t VmcntMax;    // Returned by getWaitCountMax(VM_CNT).
+  int32_t ExpcntMax;   // Returned by getWaitCountMax(EXP_CNT).
+  int32_t LgkmcntMax;  // Returned by getWaitCountMax(LGKM_CNT).
+  int32_t NumVGPRsMax; // NOTE(review): not read in the code shown here.
+  int32_t NumSGPRsMax; // NOTE(review): not read in the code shown here.
+} HardwareLimits;
+
+#define ForAllInstCounterType(t)                                               \
+  for (enum InstCounterType t = VM_CNT; (t) < NUM_INST_CNTS;                   \
+       (t) = (enum InstCounterType)((t) + 1))
+
+enum WaitEventType {
+  VMEM_ACCESS,      // vector-memory read & write (counted by VM_CNT)
+  LDS_ACCESS,       // lds read & write (counted by LGKM_CNT)
+  GDS_ACCESS,       // gds read & write (counted by LGKM_CNT)
+  SQ_MESSAGE,       // send message (counted by LGKM_CNT)
+  SMEM_ACCESS,      // scalar-memory read & write (counted by LGKM_CNT)
+  EXP_GPR_LOCK,     // export holding on its data src (counted by EXP_CNT)
+  GDS_GPR_LOCK,     // GDS holding on its data and addr src (EXP_CNT)
+  EXP_POS_ACCESS,   // write to export position (counted by EXP_CNT)
+  EXP_PARAM_ACCESS, // write to export parameter (counted by EXP_CNT)
+  VMW_GPR_LOCK,     // vector-memory write holding on its data src (EXP_CNT)
+  NUM_WAIT_EVENTS,  // number of event types, not an event itself
+};
+
+// Mapping of registers to scoreboard slot numbers:
+//  0                .. SQ_MAX_PGM_VGPRS-1               real VGPRs
+//  SQ_MAX_PGM_VGPRS .. NUM_ALL_VGPRS-1                  extra VGPR-like slots
+//  NUM_ALL_VGPRS    .. NUM_ALL_VGPRS+SQ_MAX_PGM_SGPRS-1 real SGPRs
+// We reserve a fixed number of VGPR slots in the scoring tables for
+// special tokens like SCMEM_LDS (needed for buffer load to LDS).
+enum RegisterMapping {
+  SQ_MAX_PGM_VGPRS = 256, // Maximum programmable VGPRs across all targets.
+  SQ_MAX_PGM_SGPRS = 104, // Maximum programmable SGPRs across all targets.
+  NUM_EXTRA_VGPRS = 1,    // Number of extra VGPR-like slots (one, for LDS).
+  EXTRA_VGPR_LDS = 0,     // Index of the LDS slot among the extra slots.
+  NUM_ALL_VGPRS = SQ_MAX_PGM_VGPRS + NUM_EXTRA_VGPRS, // Where SGPR starts.
+};
+
+#define ForAllWaitEventType(w)                                                 \
+  for (enum WaitEventType w = (enum WaitEventType)0;                           \
+       (w) < (enum WaitEventType)NUM_WAIT_EVENTS;                              \
+       (w) = (enum WaitEventType)((w) + 1))
+
+// This is a per-basic-block object that maintains the current score brackets
+// of each wait-counter, and a per-register scoreboard for each wait-counter.
+// We also maintain the latest score for every event type that can change the
+// waitcnt in order to know if there are multiple types of events within
+// the brackets. When multiple types of events occur in the bracket, the
+// wait-count may be decremented out of order, therefore we need to put in
+// "s_waitcnt 0" before use.
+class BlockWaitcntBrackets {
+public:
+  static int32_t getWaitCountMax(InstCounterType T) {
+    switch (T) {
+    case VM_CNT:
+      return HardwareLimits.VmcntMax;
+    case LGKM_CNT:
+      return HardwareLimits.LgkmcntMax;
+    case EXP_CNT:
+      return HardwareLimits.ExpcntMax;
+    default:
+      break;
+    }
+    return 0;
+  };
+
+  void setScoreLB(InstCounterType T, int32_t Val) {
+    assert(T < NUM_INST_CNTS);
+    if (T >= NUM_INST_CNTS)
+      return;
+    ScoreLBs[T] = Val;
+  };
+
+  void setScoreUB(InstCounterType T, int32_t Val) {
+    assert(T < NUM_INST_CNTS);
+    if (T >= NUM_INST_CNTS)
+      return;
+    ScoreUBs[T] = Val;
+    if (T == EXP_CNT) { // Keep the EXP bracket no wider than ExpcntMax.
+      int32_t UB = (int)(ScoreUBs[T] - getWaitCountMax(EXP_CNT));
+      if (ScoreLBs[T] < UB)
+        ScoreLBs[T] = UB;
+    }
+  };
+
+  int32_t getScoreLB(InstCounterType T) {
+    assert(T < NUM_INST_CNTS);
+    if (T >= NUM_INST_CNTS)
+      return 0;
+    return ScoreLBs[T];
+  };
+
+  int32_t getScoreUB(InstCounterType T) {
+    assert(T < NUM_INST_CNTS);
+    if (T >= NUM_INST_CNTS)
+      return 0;
+    return ScoreUBs[T];
+  };
+
+  // Mapping from event to counter.
+  InstCounterType eventCounter(WaitEventType E) {
+    switch (E) {
+    case VMEM_ACCESS:
+      return VM_CNT;
+    case LDS_ACCESS:
+    case GDS_ACCESS:
+    case SQ_MESSAGE:
+    case SMEM_ACCESS:
+      return LGKM_CNT;
+    case EXP_GPR_LOCK:
+    case GDS_GPR_LOCK:
+    case VMW_GPR_LOCK:
+    case EXP_POS_ACCESS:
+    case EXP_PARAM_ACCESS:
+      return EXP_CNT;
+    default:
+      llvm_unreachable("unhandled event type");
+    }
+    return NUM_INST_CNTS;
+  }
+
+  void setRegScore(int GprNo, InstCounterType T, int32_t Val) {
+    if (GprNo < NUM_ALL_VGPRS) { // VGPR or extra VGPR-like slot.
+      if (GprNo > VgprUB) {
+        VgprUB = GprNo;
+      }
+      VgprScores[T][GprNo] = Val;
+    } else { // SGPR slot; only LGKM_CNT scores are stored for SGPRs.
+      assert(T == LGKM_CNT);
+      if (GprNo - NUM_ALL_VGPRS > SgprUB) {
+        SgprUB = GprNo - NUM_ALL_VGPRS;
+      }
+      SgprScores[GprNo - NUM_ALL_VGPRS] = Val;
+    }
+  }
+
+  int32_t getRegScore(int GprNo, InstCounterType T) {
+    if (GprNo < NUM_ALL_VGPRS) {
+      return VgprScores[T][GprNo];
+    }
+    return SgprScores[GprNo - NUM_ALL_VGPRS];
+  }
+
+  void clear() {
+    memset(ScoreLBs, 0, sizeof(ScoreLBs));
+    memset(ScoreUBs, 0, sizeof(ScoreUBs));
+    memset(EventUBs, 0, sizeof(EventUBs));
+    ForAllInstCounterType(t) {
+      memset(VgprScores[t], 0, sizeof(VgprScores[t]));
+    }
+    memset(SgprScores, 0, sizeof(SgprScores));
+  }
+
+  RegInterval getRegInterval(const MachineInstr *MI, const SIInstrInfo *TII,
+                             const MachineRegisterInfo *MRI,
+                             const SIRegisterInfo *TRI, unsigned OpNo) const;
+
+  void setExpScore(const MachineInstr *MI, const SIInstrInfo *TII,
+                   const SIRegisterInfo *TRI, const MachineRegisterInfo *MRI,
+                   unsigned OpNo, int32_t Val);
+
+  bool isRegisterSpecial(const MachineOperand &Reg);
+
+  void setWaitAtBeginning() { WaitAtBeginning = true; }
+  void clearWaitAtBeginning() { WaitAtBeginning = false; }
+  bool getWaitAtBeginning() const { return WaitAtBeginning; }
+  void setEventUB(enum WaitEventType W, int32_t Val) { EventUBs[W] = Val; }
+  int32_t getMaxVGPR() const { return VgprUB; }
+  int32_t getMaxSGPR() const { return SgprUB; }
+  int32_t getEventUB(enum WaitEventType W) const {
+    assert(W < NUM_WAIT_EVENTS);
+    return EventUBs[W];
+  }
+  bool counterOutOfOrder(InstCounterType T);
+  unsigned int updateByWait(InstCounterType T, int ScoreToWait);
+  void updateByEvent(const SIInstrInfo *TII, const SIRegisterInfo *TRI,
+                     const MachineRegisterInfo *MRI, WaitEventType E,
+                     MachineInstr &MI);
+
+  BlockWaitcntBrackets()
+      : WaitAtBeginning(false), ValidLoop(false), MixedExpTypes(false),
+        LoopRegion(NULL), PostOrder(0), Waitcnt(NULL), VgprUB(0), SgprUB(0) {
+    ForAllInstCounterType(T) {
+      memset(VgprScores[T], 0, sizeof(VgprScores[T]));
+    }
+  }
+  ~BlockWaitcntBrackets(){};
+
+  bool hasPendingSMEM() const {
+    return (EventUBs[SMEM_ACCESS] > ScoreLBs[LGKM_CNT] &&
+            EventUBs[SMEM_ACCESS] <= ScoreUBs[LGKM_CNT]);
+  }
+
+  bool hasPendingFlat() const {
+    return ((LastFlat[LGKM_CNT] > ScoreLBs[LGKM_CNT] &&
+             LastFlat[LGKM_CNT] <= ScoreUBs[LGKM_CNT]) ||
+            (LastFlat[VM_CNT] > ScoreLBs[VM_CNT] &&
+             LastFlat[VM_CNT] <= ScoreUBs[VM_CNT]));
+  }
+
+  void setPendingFlat() {
+    LastFlat[VM_CNT] = ScoreUBs[VM_CNT];
+    LastFlat[LGKM_CNT] = ScoreUBs[LGKM_CNT];
+  }
+
+  int pendingFlat(InstCounterType Ct) const { return LastFlat[Ct]; }
+
+  void setLastFlat(InstCounterType Ct, int Val) { LastFlat[Ct] = Val; }
+
+  bool getRevisitLoop() const { return RevisitLoop; }
+  void setRevisitLoop(bool RevisitLoopIn) { RevisitLoop = RevisitLoopIn; }
+
+  void setPostOrder(int32_t PostOrderIn) { PostOrder = PostOrderIn; }
+  int32_t getPostOrder() const { return PostOrder; }
+
+  void setWaitcnt(MachineInstr *WaitcntIn) { Waitcnt = WaitcntIn; }
+  void clearWaitcnt() { Waitcnt = NULL; }
+  MachineInstr *getWaitcnt() const { return Waitcnt; }
+
+  bool mixedExpTypes() const { return MixedExpTypes; }
+  void setMixedExpTypes(bool MixedExpTypesIn) {
+    MixedExpTypes = MixedExpTypesIn;
+  }
+
+  void print(raw_ostream &);
+  void dump() { print(dbgs()); }
+
+private:
+  bool WaitAtBeginning;
+  bool RevisitLoop = false; // Not covered by the constructor's init list.
+  bool ValidLoop;
+  bool MixedExpTypes;
+  MachineLoop *LoopRegion;
+  int32_t PostOrder;
+  MachineInstr *Waitcnt;
+  int32_t ScoreLBs[NUM_INST_CNTS] = {0};
+  int32_t ScoreUBs[NUM_INST_CNTS] = {0};
+  int32_t EventUBs[NUM_WAIT_EVENTS] = {0};
+  // Remember the last flat memory operation.
+  int32_t LastFlat[NUM_INST_CNTS] = {0};
+  // wait_cnt scores for every vgpr.
+  // Keep track of the VgprUB and SgprUB to make merge at join efficient.
+  int32_t VgprUB;
+  int32_t SgprUB;
+  int32_t VgprScores[NUM_INST_CNTS][NUM_ALL_VGPRS];
+  // Wait cnt scores for every sgpr, only lgkmcnt is relevant.
+  int32_t SgprScores[SQ_MAX_PGM_SGPRS] = {0};
+};
+
+// This is a per-loop-region object that records waitcnt status at the end of
+// loop footer from the previous iteration. We also maintain an iteration
+// count to track the number of times the loop has been visited. When it
+// doesn't converge naturally, we force convergence by inserting s_waitcnt 0
+// at the end of the loop footer.
+class LoopWaitcntData {
+public:
+  LoopWaitcntData() : LfWaitcnt(nullptr), IterCnt(0) {}
+  ~LoopWaitcntData() {}
+
+  // Accessors for the s_waitcnt recorded for the loop footer.
+  MachineInstr *getWaitcnt() const { return LfWaitcnt; }
+  void setWaitcnt(MachineInstr *WaitcntIn) { LfWaitcnt = WaitcntIn; }
+
+  // Bookkeeping for the number of visits made to the loop.
+  int32_t getIterCnt() { return IterCnt; }
+  void incIterCnt() { ++IterCnt; }
+  void resetIterCnt() { IterCnt = 0; }
+
+  // Debug-only: prints the current iteration count.
+  void print() { DEBUG(dbgs() << "  iteration " << IterCnt << '\n';); }
+
+private:
+  // s_waitcnt added at the end of the loop footer to stabilize the wait
+  // scores at the end of the loop footer.
+  MachineInstr *LfWaitcnt;
+  // Number of times the loop has been visited, not including the initial
+  // walk over.
+  int32_t IterCnt;
+};
+
+class SIInsertWaitcnts : public MachineFunctionPass {
+
+private:
+  const SISubtarget *ST; // These five handles start out null (see ctor).
+  const SIInstrInfo *TII;
+  const SIRegisterInfo *TRI;
+  const MachineRegisterInfo *MRI;
+  const MachineLoopInfo *MLI;
+
+  DenseSet<MachineBasicBlock *> BlockVisitedSet;
+  DenseSet<MachineInstr *> CompilerGeneratedWaitcntSet;
+
+  DenseMap<MachineBasicBlock *, std::unique_ptr<BlockWaitcntBrackets>>
+      BlockWaitcntBracketsMap; // Owned score brackets, keyed by block.
+
+  DenseSet<MachineBasicBlock *> BlockWaitcntProcessedSet;
+
+  DenseMap<MachineLoop *, std::unique_ptr<LoopWaitcntData>> LoopWaitcntDataMap;
+
+  std::vector<std::unique_ptr<BlockWaitcntBrackets>> KillWaitBrackets;
+
+public:
+  static char ID;
+
+  SIInsertWaitcnts()
+      : MachineFunctionPass(ID), ST(nullptr), TII(nullptr), TRI(nullptr),
+        MRI(nullptr), MLI(nullptr) {}
+
+  bool runOnMachineFunction(MachineFunction &MF) override;
+
+  StringRef getPassName() const override {
+    return "SI insert wait instructions";
+  }
+
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
+    AU.setPreservesCFG(); // The pass declares that it keeps the CFG intact.
+    AU.addRequired<MachineLoopInfo>();
+    MachineFunctionPass::getAnalysisUsage(AU);
+  }
+
+  void addKillWaitBracket(BlockWaitcntBrackets *Bracket) {
+    // Store a snapshot (copy) of the bracket state, because the original
+    // keeps changing as the block is traversed.
+    KillWaitBrackets.push_back(make_unique<BlockWaitcntBrackets>(*Bracket));
+  }
+
+  MachineInstr *generateSWaitCntInstBefore(MachineInstr &MI,
+                                           BlockWaitcntBrackets *ScoreBrackets);
+  void updateEventWaitCntAfter(MachineInstr &Inst,
+                               BlockWaitcntBrackets *ScoreBrackets);
+  void mergeInputScoreBrackets(MachineBasicBlock &Block);
+  MachineBasicBlock *loopBottom(const MachineLoop *Loop);
+  void insertWaitcntInBlock(MachineFunction &MF, MachineBasicBlock &Block);
+  void insertWaitcntBeforeCF(MachineBasicBlock &Block, MachineInstr *Inst);
+};
+
+} // End anonymous namespace.
+
+RegInterval BlockWaitcntBrackets::getRegInterval(const MachineInstr *MI,
+                                                 const SIInstrInfo *TII,
+                                                 const MachineRegisterInfo *MRI,
+                                                 const SIRegisterInfo *TRI,
+                                                 unsigned OpNo) const {
+  // Returns the half-open range [first, second) of scoreboard slots covered
+  // by operand OpNo of MI: one slot per 4 units of the register class size.
+  const unsigned PhysReg = MI->getOperand(OpNo).getReg();
+
+  // VGPR encodings are rebased by -SQ_MAX_PGM_VGPRS; every other register is
+  // placed after the VGPR slots, starting at NUM_ALL_VGPRS.
+  int Base = NUM_ALL_VGPRS;
+  if (TRI->isVGPR(*MRI, PhysReg))
+    Base = -SQ_MAX_PGM_VGPRS;
+
+  const unsigned RegSize = TII->getOpRegClass(*MI, OpNo)->getSize();
+  assert(RegSize >= 4);
+
+  const unsigned First = TRI->getEncodingValue(PhysReg) + Base;
+  return RegInterval(First, First + RegSize / 4);
+}
+
+void BlockWaitcntBrackets::setExpScore(const MachineInstr *MI,
+                                       const SIInstrInfo *TII,
+                                       const SIRegisterInfo *TRI,
+                                       const MachineRegisterInfo *MRI,
+                                       unsigned OpNo, int32_t Val) {
+  // Stamps Val as the EXP_CNT score of every slot covered by operand OpNo.
+  const RegInterval Slots = getRegInterval(MI, TII, MRI, TRI, OpNo);
+  // Debug-only sanity check: the operand is expected to be a VGPR.
+  DEBUG(assert(TRI->isVGPR(*MRI, MI->getOperand(OpNo).getReg())););
+  unsigned Slot = Slots.first;
+  while (Slot < Slots.second) {
+    setRegScore(Slot++, EXP_CNT, Val);
+  }
+}
+
+bool BlockWaitcntBrackets::isRegisterSpecial(const MachineOperand &Reg) {
+  const unsigned R = Reg.getReg(); // Special: SCC, M0, VCC*, EXEC*.
+  const bool IsVCC =
+      R == AMDGPU::VCC || R == AMDGPU::VCC_LO || R == AMDGPU::VCC_HI;
+  const bool IsEXEC =
+      R == AMDGPU::EXEC || R == AMDGPU::EXEC_LO || R == AMDGPU::EXEC_HI;
+  return R == AMDGPU::SCC || R == AMDGPU::M0 || IsVCC || IsEXEC;
+}
+
+/// Records event \p E, caused by \p Inst, in this block's score brackets.
+///
+/// The upper bound of the counter that \p E maps to is advanced by one and
+/// remembered as the latest score of \p E. For EXP_CNT events the new score
+/// is attached to the VGPRs the instruction uses; for every other counter it
+/// is attached to the registers the instruction defines.
+void BlockWaitcntBrackets::updateByEvent(const SIInstrInfo *TII,
+                                         const SIRegisterInfo *TRI,
+                                         const MachineRegisterInfo *MRI,
+                                         WaitEventType E, MachineInstr &Inst) {
+  const MachineRegisterInfo &MRIA = *MRI;
+  InstCounterType T = eventCounter(E);
+  int32_t CurrScore = getScoreUB(T) + 1;
+  // EventUB and ScoreUB need to be updated regardless of whether this event
+  // changes the score of a register or not. Examples include vm_cnt for a
+  // buffer-store or lgkm_cnt for a send-message.
+  EventUBs[E] = CurrScore;
+  setScoreUB(T, CurrScore);
+
+  if (T == EXP_CNT) {
+    // Check for mixed export types. If they are mixed, then a waitcnt exp(0)
+    // is required.
+    if (!MixedExpTypes) {
+      MixedExpTypes = counterOutOfOrder(EXP_CNT);
+    }
+
+    // Put score on the source vgprs. If this is a store, just use those
+    // specific register(s).
+    if (TII->isDS(Inst)) {
+      // All GDS operations must protect their address register (same as
+      // export.)
+      if (Inst.getOpcode() != AMDGPU::DS_APPEND &&
+          Inst.getOpcode() != AMDGPU::DS_CONSUME) {
+        setExpScore(
+            &Inst, TII, TRI, MRI,
+            AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::addr),
+            CurrScore);
+      }
+      if (Inst.mayStore()) {
+        setExpScore(
+            &Inst, TII, TRI, MRI,
+            AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::data0),
+            CurrScore);
+        if (AMDGPU::getNamedOperandIdx(Inst.getOpcode(),
+                                       AMDGPU::OpName::data1) != -1) {
+          setExpScore(&Inst, TII, TRI, MRI,
+                      AMDGPU::getNamedOperandIdx(Inst.getOpcode(),
+                                                 AMDGPU::OpName::data1),
+                      CurrScore);
+        }
+      } else if (AMDGPU::getAtomicNoRetOp(Inst.getOpcode()) != -1 &&
+                 Inst.getOpcode() != AMDGPU::DS_GWS_INIT &&
+                 Inst.getOpcode() != AMDGPU::DS_GWS_SEMA_V &&
+                 Inst.getOpcode() != AMDGPU::DS_GWS_SEMA_BR &&
+                 Inst.getOpcode() != AMDGPU::DS_GWS_SEMA_P &&
+                 Inst.getOpcode() != AMDGPU::DS_GWS_BARRIER &&
+                 Inst.getOpcode() != AMDGPU::DS_APPEND &&
+                 Inst.getOpcode() != AMDGPU::DS_CONSUME &&
+                 Inst.getOpcode() != AMDGPU::DS_ORDERED_COUNT) {
+        for (unsigned I = 0, NumOps = Inst.getNumOperands(); I != NumOps; ++I) {
+          const MachineOperand &Op = Inst.getOperand(I);
+          if (Op.isReg() && !Op.isDef() && TRI->isVGPR(MRIA, Op.getReg())) {
+            setExpScore(&Inst, TII, TRI, MRI, I, CurrScore);
+          }
+        }
+      }
+    } else if (TII->isFLAT(Inst)) {
+      if (Inst.mayStore()) {
+        setExpScore(
+            &Inst, TII, TRI, MRI,
+            AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::data),
+            CurrScore);
+      } else if (AMDGPU::getAtomicNoRetOp(Inst.getOpcode()) != -1) {
+        setExpScore(
+            &Inst, TII, TRI, MRI,
+            AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::data),
+            CurrScore);
+      }
+    } else if (TII->isMIMG(Inst)) {
+      if (Inst.mayStore()) {
+        // NOTE(review): operand 0 presumed to be the store data (also below).
+        setExpScore(&Inst, TII, TRI, MRI, 0, CurrScore);
+      } else if (AMDGPU::getAtomicNoRetOp(Inst.getOpcode()) != -1) {
+        setExpScore(
+            &Inst, TII, TRI, MRI,
+            AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::data),
+            CurrScore);
+      }
+    } else if (TII->isMTBUF(Inst)) {
+      if (Inst.mayStore()) {
+        setExpScore(&Inst, TII, TRI, MRI, 0, CurrScore);
+      }
+    } else if (TII->isMUBUF(Inst)) {
+      if (Inst.mayStore()) {
+        setExpScore(&Inst, TII, TRI, MRI, 0, CurrScore);
+      } else if (AMDGPU::getAtomicNoRetOp(Inst.getOpcode()) != -1) {
+        setExpScore(
+            &Inst, TII, TRI, MRI,
+            AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::data),
+            CurrScore);
+      }
+    } else {
+      if (TII->isEXP(Inst)) {
+        // For export the destination registers are really temps that
+        // can be used as the actual source after export patching, so
+        // we need to treat them like sources and set the EXP_CNT
+        // score.
+        for (unsigned I = 0, NumOps = Inst.getNumOperands(); I != NumOps; ++I) {
+          MachineOperand &DefMO = Inst.getOperand(I);
+          if (DefMO.isReg() && DefMO.isDef() &&
+              TRI->isVGPR(MRIA, DefMO.getReg())) {
+            // Go through getRegInterval so the encoding maps to a VGPR slot.
+            setExpScore(&Inst, TII, TRI, MRI, I, CurrScore);
+          }
+        }
+      }
+      // Any other EXP_CNT event: score every VGPR the instruction reads.
+      for (unsigned I = 0, NumOps = Inst.getNumOperands(); I != NumOps; ++I) {
+        MachineOperand &MO = Inst.getOperand(I);
+        if (MO.isReg() && !MO.isDef() && TRI->isVGPR(MRIA, MO.getReg())) {
+          setExpScore(&Inst, TII, TRI, MRI, I, CurrScore);
+        }
+      }
+    }
+    // TODO: check whether BUFFER_STORE_DWORD[X2|X4] need handling beyond the
+    // MUBUF case above.
+  } else {
+    // Match the score to the destination registers.
+    for (unsigned I = 0, NumOps = Inst.getNumOperands(); I != NumOps; ++I) {
+      MachineOperand &Def = Inst.getOperand(I);
+      if (!Def.isReg() || !Def.isDef() || isRegisterSpecial(Def)) {
+        continue;
+      }
+      RegInterval Interval = getRegInterval(&Inst, TII, MRI, TRI, I);
+      // vm_cnt scores are only kept for VGPR slots; skip SGPR definitions.
+      if (T == VM_CNT && Interval.first >= NUM_ALL_VGPRS)
+        continue;
+      for (unsigned RegNo = Interval.first; RegNo < Interval.second; ++RegNo) {
+        setRegScore(RegNo, T, CurrScore);
+      }
+    }
+    // DS stores additionally stamp the extra LDS slot.
+    if (TII->isDS(Inst) && Inst.mayStore()) {
+      setRegScore(SQ_MAX_PGM_VGPRS + EXTRA_VGPR_LDS, T, CurrScore);
+    }
+  }
+}
+
+void BlockWaitcntBrackets::print(raw_ostream &OS) {
+  // Dumps, for each counter, the width of its bracket followed by every
+  // register whose score is still inside the bracket, written as
+  // "<score relative to the lower bound>:<register>".
+  OS << '\n';
+  ForAllInstCounterType(Ctr) {
+    const int Lower = getScoreLB(Ctr);
+    const int Upper = getScoreUB(Ctr);
+
+    // Header: counter name and bracket width.
+    const char *Header = "    UNKNOWN(";
+    if (Ctr == VM_CNT)
+      Header = "    VM_CNT(";
+    else if (Ctr == LGKM_CNT)
+      Header = "    LGKM_CNT(";
+    else if (Ctr == EXP_CNT)
+      Header = "    EXP_CNT(";
+    OS << Header << Upper - Lower << "): ";
+
+    // An empty bracket has no register scores to report.
+    if (Lower >= Upper) {
+      OS << '\n';
+      continue;
+    }
+
+    // VGPR scores; the slot past the real VGPRs is printed as "ds".
+    for (int V = 0; V <= getMaxVGPR(); ++V) {
+      const int Score = getRegScore(V, Ctr);
+      if (Score <= Lower)
+        continue;
+      OS << Score - Lower - 1;
+      if (V < SQ_MAX_PGM_VGPRS + EXTRA_VGPR_LDS)
+        OS << ":v" << V << " ";
+      else
+        OS << ":ds ";
+    }
+
+    // SGPR scores are only kept for lgkm_cnt.
+    if (Ctr == LGKM_CNT) {
+      for (int S = 0; S <= getMaxSGPR(); ++S) {
+        const int Score = getRegScore(S + NUM_ALL_VGPRS, LGKM_CNT);
+        if (Score <= Lower)
+          continue;
+        OS << Score - Lower - 1 << ":s" << S << " ";
+      }
+    }
+    OS << '\n';
+  }
+  OS << '\n';
+}
+
+unsigned int BlockWaitcntBrackets::updateByWait(InstCounterType T,
+                                                int ScoreToWait) {
+  // Returns CNT_MASK(T) when a wait on counter T is required (the lower
+  // bound of T's bracket is advanced accordingly), or 0 when it is not.
+  const unsigned int WaitMask = CNT_MASK(T);
+
+  if (ScoreToWait == -1) {
+    // The score to wait for is unknown. This implies that it was not
+    // encountered on the CFG path walked during the current traversal, but
+    // it may be seen on a different path. Be conservative and wait for the
+    // counter to drain completely.
+    setScoreLB(T, getScoreUB(T));
+    return WaitMask;
+  }
+
+  // Only a score that falls within the bracket (LB, UB] needs an s_waitcnt.
+  const int32_t Lower = getScoreLB(T);
+  const int32_t Upper = getScoreUB(T);
+  if (ScoreToWait <= Lower || ScoreToWait > Upper)
+    return 0;
+
+  // A precise wait is only possible when the counter decrements in order.
+  // Fall back to waiting for the whole bracket when
+  //  - this is a VM wait and a FLAT operation is still pending, or
+  //  - several types of events in the bracket may complete out of order.
+  const bool DrainAll =
+      (T == VM_CNT && hasPendingFlat()) || counterOutOfOrder(T);
+
+  if (DrainAll) {
+    setScoreLB(T, getScoreUB(T));
+  } else {
+    // In-order counter: waiting up to the requested score is enough.
+    setScoreLB(T, ScoreToWait);
+  }
+
+  return WaitMask;
+}
+
+// When there are multiple types of events in the bracket of a counter,
+// the counter may be decremented out of order.
+bool BlockWaitcntBrackets::counterOutOfOrder(InstCounterType T) {
+  switch (T) {
+  case VM_CNT:
+    return false; // vm_cnt is always treated as in order here.
+  case LGKM_CNT: {
+    if (EventUBs[SMEM_ACCESS] > ScoreLBs[LGKM_CNT] &&
+        EventUBs[SMEM_ACCESS] <= ScoreUBs[LGKM_CNT]) {
+      // Scalar memory reads can always return out of order.
+      return true;
+    }
+    int NumEventTypes = 0; // Event types with a score inside the bracket.
+    if (EventUBs[LDS_ACCESS] > ScoreLBs[LGKM_CNT] &&
+        EventUBs[LDS_ACCESS] <= ScoreUBs[LGKM_CNT]) {
+      NumEventTypes++;
+    }
+    if (EventUBs[GDS_ACCESS] > ScoreLBs[LGKM_CNT] &&
+        EventUBs[GDS_ACCESS] <= ScoreUBs[LGKM_CNT]) {
+      NumEventTypes++;
+    }
+    if (EventUBs[SQ_MESSAGE] > ScoreLBs[LGKM_CNT] &&
+        EventUBs[SQ_MESSAGE] <= ScoreUBs[LGKM_CNT]) {
+      NumEventTypes++;
+    }
+    if (NumEventTypes <= 1) {
+      return false;
+    }
+    break; // Mixed event types: reported as out of order below.
+  }
+  case EXP_CNT: {
+    // If there has been a mixture of export types, then a waitcnt exp(0) is
+    // required.
+    if (MixedExpTypes)
+      return true;
+    int NumEventTypes = 0; // Event types with a score inside the bracket.
+    if (EventUBs[EXP_GPR_LOCK] > ScoreLBs[EXP_CNT] &&
+        EventUBs[EXP_GPR_LOCK] <= ScoreUBs[EXP_CNT]) {
+      NumEventTypes++;
+    }
+    if (EventUBs[GDS_GPR_LOCK] > ScoreLBs[EXP_CNT] &&
+        EventUBs[GDS_GPR_LOCK] <= ScoreUBs[EXP_CNT]) {
+      NumEventTypes++;
+    }
+    if (EventUBs[VMW_GPR_LOCK] > ScoreLBs[EXP_CNT] &&
+        EventUBs[VMW_GPR_LOCK] <= ScoreUBs[EXP_CNT]) {
+      NumEventTypes++;
+    }
+    if (EventUBs[EXP_PARAM_ACCESS] > ScoreLBs[EXP_CNT] &&
+        EventUBs[EXP_PARAM_ACCESS] <= ScoreUBs[EXP_CNT]) {
+      NumEventTypes++;
+    }
+
+    if (EventUBs[EXP_POS_ACCESS] > ScoreLBs[EXP_CNT] &&
+        EventUBs[EXP_POS_ACCESS] <= ScoreUBs[EXP_CNT]) {
+      NumEventTypes++;
+    }
+
+    if (NumEventTypes <= 1) {
+      return false;
+    }
+    break; // Mixed event types: reported as out of order below.
+  }
+  default:
+    break; // Any other counter value is reported as out of order.
+  }
+  return true;
+}
+
+INITIALIZE_PASS_BEGIN(SIInsertWaitcnts, DEBUG_TYPE, "SI Insert Waitcnts", false,
+                      false)
+INITIALIZE_PASS_END(SIInsertWaitcnts, DEBUG_TYPE, "SI Insert Waitcnts", false,
+                    false)
+
+char SIInsertWaitcnts::ID = 0; // Handed to MachineFunctionPass by the ctor.
+
+char &llvm::SIInsertWaitcntsID = SIInsertWaitcnts::ID; // Declared in AMDGPU.h.
+
+FunctionPass *llvm::createSIInsertWaitcntsPass() {
+  return new SIInsertWaitcnts(); // Raw pointer; the target machine addPass()es it.
+}
+
+static bool readsVCCZ(const MachineInstr &MI) {
+  const unsigned Op = MI.getOpcode(); // Only VCC[N]Z branches can qualify.
+  return (Op == AMDGPU::S_CBRANCH_VCCZ || Op == AMDGPU::S_CBRANCH_VCCNZ) &&
+         !MI.getOperand(1).isUndef();
+}
+
+/**
+ *  @brief Generate the s_waitcnt instruction to be placed before MI.
+ *
+ *         Instructions of a given type are returned in order,
+ *         but instructions of different types can complete out of order.
+ *         We rely on this in-order completion
+ *         and simply assign a score to the memory access instructions.
+ *         We keep track of the active "score bracket" to determine
+ *         if an access of a memory read requires an s_waitcnt
+ *         and if so what the value of each counter is.
+ *         The "score bracket" is bound by the lower bound and upper bound
+ *         scores (*_score_LB and *_score_ub respectively).
+ */
+MachineInstr *SIInsertWaitcnts::generateSWaitCntInstBefore(
+    MachineInstr &MI, BlockWaitcntBrackets *ScoreBrackets) {
+  // Decide whether an s_waitcnt has to precede MI and build it if so.
+  //
+  // MI            - the instruction about to be processed.
+  // ScoreBrackets - the counter/register score state of the current block;
+  //                 its lower bounds are advanced for every counter that the
+  //                 function decides to wait on.
+  //
+  // Returns the S_WAITCNT instruction to place before MI, or nullptr when no
+  // wait is required. The instruction is only created here; inserting it into
+  // the block is left to the caller.
+
+  // To emit, or not to emit - that's the question!
+  // Start with an assumption that there is no need to emit.
+  // EmitSwaitcnt is a bitmask holding one CNT_MASK(T) bit per counter that
+  // has to be waited on.
+  unsigned int EmitSwaitcnt = 0;
+  // s_waitcnt instruction to return; default is nullptr.
+  MachineInstr *SWaitInst = nullptr;
+  // No need to wait before phi. If a phi-move exists, then the wait should
+  // have been inserted before the move. If a phi-move does not exist, then
+  // wait should be inserted before the real use. The same is true for
+  // sc-merge. It is not a coincidence that all these cases correspond to the
+  // instructions that are skipped in the assembling loop.
+  bool NeedLinemapping = false; // TODO: Check on this.
+  if (MI.getOpcode() == AMDGPU::DBG_VALUE &&
+      // TODO: any other opcode?
+      !NeedLinemapping) {
+    return SWaitInst;
+  }
+
+  // See if an s_waitcnt is forced at block entry, or is needed at
+  // program end.
+  if (ScoreBrackets->getWaitAtBeginning()) {
+    // Note that we have already cleared the state, so we don't need to update
+    // it.
+    ScoreBrackets->clearWaitAtBeginning();
+    ForAllInstCounterType(t) {
+      EmitSwaitcnt |= CNT_MASK(t);
+      ScoreBrackets->setScoreLB(t, ScoreBrackets->getScoreUB(t));
+    }
+  }
+
+  // See if this instruction has a forced S_WAITCNT VM.
+  // TODO: Handle other cases of NeedsWaitcntVmBefore()
+  else if (MI.getOpcode() == AMDGPU::BUFFER_WBINVL1 ||
+           MI.getOpcode() == AMDGPU::BUFFER_WBINVL1_SC ||
+           MI.getOpcode() == AMDGPU::BUFFER_WBINVL1_VOL) {
+    EmitSwaitcnt |=
+        ScoreBrackets->updateByWait(VM_CNT, ScoreBrackets->getScoreUB(VM_CNT));
+  }
+
+  // All waits must be resolved at call return.
+  // NOTE: this could be improved with knowledge of all call sites or
+  //   with knowledge of the called routines.
+  if (MI.getOpcode() == AMDGPU::RETURN ||
+      MI.getOpcode() == AMDGPU::SI_RETURN_TO_EPILOG) {
+    ForAllInstCounterType(t) {
+      if (ScoreBrackets->getScoreUB(t) > ScoreBrackets->getScoreLB(t)) {
+        ScoreBrackets->setScoreLB(t, ScoreBrackets->getScoreUB(t));
+        EmitSwaitcnt |= CNT_MASK(t);
+      }
+    }
+  }
+  // Resolve vm waits before gs-done.
+  else if ((MI.getOpcode() == AMDGPU::S_SENDMSG ||
+            MI.getOpcode() == AMDGPU::S_SENDMSGHALT) &&
+           ((MI.getOperand(0).getImm() & AMDGPU::SendMsg::ID_MASK_) ==
+            AMDGPU::SendMsg::ID_GS_DONE)) {
+    if (ScoreBrackets->getScoreUB(VM_CNT) > ScoreBrackets->getScoreLB(VM_CNT)) {
+      ScoreBrackets->setScoreLB(VM_CNT, ScoreBrackets->getScoreUB(VM_CNT));
+      EmitSwaitcnt |= CNT_MASK(VM_CNT);
+    }
+  }
+#if 0 // TODO: the following blocks of logic when we have fence.
+  else if (MI.getOpcode() == SC_FENCE) {
+    const unsigned int group_size =
+      context->shader_info->GetMaxThreadGroupSize();
+    // group_size == 0 means thread group size is unknown at compile time
+    const bool group_is_multi_wave =
+      (group_size == 0 || group_size > target_info->GetWaveFrontSize());
+    const bool fence_is_global = !((SCInstInternalMisc*)Inst)->IsGroupFence();
+
+    for (unsigned int i = 0; i < Inst->NumSrcOperands(); i++) {
+      SCRegType src_type = Inst->GetSrcType(i);
+      switch (src_type) {
+        case SCMEM_LDS:
+          if (group_is_multi_wave ||
+              context->OptFlagIsOn(OPT_R1100_LDSMEM_FENCE_CHICKEN_BIT)) {
+            EmitSwaitcnt |= ScoreBrackets->updateByWait(LGKM_CNT,
+                               ScoreBrackets->getScoreUB(LGKM_CNT));
+            // LDS may have to wait for VM_CNT after buffer load to LDS
+            if (target_info->HasBufferLoadToLDS()) {
+              EmitSwaitcnt |= ScoreBrackets->updateByWait(VM_CNT,
+                                 ScoreBrackets->getScoreUB(VM_CNT));
+            }
+          }
+          break;
+
+        case SCMEM_GDS:
+          if (group_is_multi_wave || fence_is_global) {
+            EmitSwaitcnt |= ScoreBrackets->updateByWait(EXP_CNT,
+                               ScoreBrackets->getScoreUB(EXP_CNT));
+            EmitSwaitcnt |= ScoreBrackets->updateByWait(LGKM_CNT,
+                               ScoreBrackets->getScoreUB(LGKM_CNT));
+          }
+          break;
+
+        case SCMEM_UAV:
+        case SCMEM_TFBUF:
+        case SCMEM_RING:
+        case SCMEM_SCATTER:
+          if (group_is_multi_wave || fence_is_global) {
+            EmitSwaitcnt |= ScoreBrackets->updateByWait(EXP_CNT,
+                               ScoreBrackets->getScoreUB(EXP_CNT));
+            EmitSwaitcnt |= ScoreBrackets->updateByWait(VM_CNT,
+                               ScoreBrackets->getScoreUB(VM_CNT));
+          }
+          break;
+
+        case SCMEM_SCRATCH:
+        default:
+          break;
+      }
+    }
+  }
+#endif
+
+  // Export & GDS instructions do not read the EXEC mask until after the export
+  // is granted (which can occur well after the instruction is issued).
+  // The shader program must flush all EXP operations on the export-count
+  // before overwriting the EXEC mask.
+  else {
+    if (MI.modifiesRegister(AMDGPU::EXEC, TRI)) {
+      // Export and GDS are tracked individually, either may trigger a waitcnt
+      // for EXEC.
+      EmitSwaitcnt |= ScoreBrackets->updateByWait(
+          EXP_CNT, ScoreBrackets->getEventUB(EXP_GPR_LOCK));
+      EmitSwaitcnt |= ScoreBrackets->updateByWait(
+          EXP_CNT, ScoreBrackets->getEventUB(EXP_PARAM_ACCESS));
+      EmitSwaitcnt |= ScoreBrackets->updateByWait(
+          EXP_CNT, ScoreBrackets->getEventUB(EXP_POS_ACCESS));
+      EmitSwaitcnt |= ScoreBrackets->updateByWait(
+          EXP_CNT, ScoreBrackets->getEventUB(GDS_GPR_LOCK));
+    }
+
+#if 0 // TODO: the following code to handle CALL.
+    // The argument passing for CALLs should suffice for VM_CNT and LGKM_CNT.
+    // However, there is a problem with EXP_CNT, because the call cannot
+    // easily tell if a register is used in the function, and if it did, then
+    // the referring instruction would have to have an S_WAITCNT, which is
+    // dependent on all call sites. So Instead, force S_WAITCNT for EXP_CNTs
+    // before the call.
+    if (MI.getOpcode() == SC_CALL) {
+      if (ScoreBrackets->getScoreUB(EXP_CNT) >
+          ScoreBrackets->getScoreLB(EXP_CNT)) {
+        ScoreBrackets->setScoreLB(EXP_CNT, ScoreBrackets->getScoreUB(EXP_CNT));
+        EmitSwaitcnt |= CNT_MASK(EXP_CNT);
+      }
+    }
+#endif
+
+    // Look at the source operands of every instruction to see if
+    // any of them results from a previous memory operation that affects
+    // its current usage. If so, an s_waitcnt instruction needs to be
+    // emitted.
+    // If the source operand was defined by a load, add the s_waitcnt
+    // instruction.
+    // LDS accesses are tracked through one pseudo register slot placed after
+    // the VGPR slots (SQ_MAX_PGM_VGPRS + EXTRA_VGPR_LDS).
+    for (const MachineMemOperand *Memop : MI.memoperands()) {
+      unsigned AS = Memop->getAddrSpace();
+      if (AS != AMDGPUAS::LOCAL_ADDRESS)
+        continue;
+      unsigned RegNo = SQ_MAX_PGM_VGPRS + EXTRA_VGPR_LDS;
+      // VM_CNT is only relevant to vgpr or LDS.
+      EmitSwaitcnt |= ScoreBrackets->updateByWait(
+          VM_CNT, ScoreBrackets->getRegScore(RegNo, VM_CNT));
+    }
+    for (unsigned I = 0, E = MI.getNumOperands(); I != E; ++I) {
+      const MachineOperand &Op = MI.getOperand(I);
+      if (!Op.isReg() || Op.isDef())
+        continue;
+      // A use via a PW operand does not need a waitcnt.
+      assert(!Op.getSubReg() || !Op.isUndef());
+      if (ScoreBrackets->isRegisterSpecial(Op))
+        continue;
+      const MachineRegisterInfo &MRIA = *MRI;
+      RegInterval Interval =
+          ScoreBrackets->getRegInterval(&MI, TII, MRI, TRI, I);
+      for (unsigned RegNo = Interval.first; RegNo < Interval.second; ++RegNo) {
+        if (TRI->isVGPR(MRIA, Op.getReg())) {
+          // VM_CNT is only relevant to vgpr or LDS.
+          EmitSwaitcnt |= ScoreBrackets->updateByWait(
+              VM_CNT, ScoreBrackets->getRegScore(RegNo, VM_CNT));
+        }
+        EmitSwaitcnt |= ScoreBrackets->updateByWait(
+            LGKM_CNT, ScoreBrackets->getRegScore(RegNo, LGKM_CNT));
+      }
+    }
+    // End of for loop that looks at all source operands to decide vm_wait_cnt
+    // and lgk_wait_cnt.
+
+    // Two cases are handled for destination operands:
+    // 1) If the destination operand was defined by a load, add the s_waitcnt
+    // instruction to guarantee the right WAW order.
+    // 2) If a destination operand that was used by a recent export/store ins,
+    // add s_waitcnt on exp_cnt to guarantee the WAR order.
+    if (MI.mayStore()) {
+      for (const MachineMemOperand *Memop : MI.memoperands()) {
+        unsigned AS = Memop->getAddrSpace();
+        if (AS != AMDGPUAS::LOCAL_ADDRESS)
+          continue;
+        unsigned RegNo = SQ_MAX_PGM_VGPRS + EXTRA_VGPR_LDS;
+        EmitSwaitcnt |= ScoreBrackets->updateByWait(
+            VM_CNT, ScoreBrackets->getRegScore(RegNo, VM_CNT));
+        EmitSwaitcnt |= ScoreBrackets->updateByWait(
+            EXP_CNT, ScoreBrackets->getRegScore(RegNo, EXP_CNT));
+      }
+    }
+    for (unsigned I = 0, E = MI.getNumOperands(); I != E; ++I) {
+      MachineOperand &Def = MI.getOperand(I);
+      if (!Def.isReg() || !Def.isDef())
+        continue;
+      // A partial write is not a WAW.
+      assert(!Def.getSubReg() || !Def.isUndef());
+      if (ScoreBrackets->isRegisterSpecial(Def))
+        continue;
+      const MachineRegisterInfo &MRIA = *MRI;
+      RegInterval Interval =
+          ScoreBrackets->getRegInterval(&MI, TII, MRI, TRI, I);
+      for (unsigned RegNo = Interval.first; RegNo < Interval.second; ++RegNo) {
+        if (TRI->isVGPR(MRIA, Def.getReg())) {
+          EmitSwaitcnt |= ScoreBrackets->updateByWait(
+              VM_CNT, ScoreBrackets->getRegScore(RegNo, VM_CNT));
+          EmitSwaitcnt |= ScoreBrackets->updateByWait(
+              EXP_CNT, ScoreBrackets->getRegScore(RegNo, EXP_CNT));
+        }
+        EmitSwaitcnt |= ScoreBrackets->updateByWait(
+            LGKM_CNT, ScoreBrackets->getRegScore(RegNo, LGKM_CNT));
+      }
+    } // End of for loop that looks at all dest operands.
+  }
+
+  // TODO: Tie force zero to a compiler triage option.
+  bool ForceZero = false;
+
+  if (MI.getOpcode() == AMDGPU::S_BARRIER && ST->needWaitcntBeforeBarrier()) {
+    // insertWaitcntInBlock has already advanced the lower bound of every
+    // counter for this barrier, so the emitted instruction has to wait on all
+    // of them. Assigning "true" alone would collapse the per-counter mask to
+    // the value 1 and leave at most one counter encoded; take the force-zero
+    // path so that a full "s_waitcnt 0" is produced instead.
+    ForceZero = true;
+    EmitSwaitcnt = true;
+  }
+
+  // TODO: Remove this work-around, enable the assert for Bug 457939
+  //       after fixing the scheduler. Also, the Shader Compiler code is
+  //       independent of target.
+  if (readsVCCZ(MI) && ST->getGeneration() <= SISubtarget::SEA_ISLANDS) {
+    if (ScoreBrackets->getScoreLB(LGKM_CNT) <
+            ScoreBrackets->getScoreUB(LGKM_CNT) &&
+        ScoreBrackets->hasPendingSMEM()) {
+      // Wait on everything, not just LGKM.  vccz reads usually come from
+      // terminators, and we always wait on everything at the end of the
+      // block, so if we only wait on LGKM here, we might end up with
+      // another s_waitcnt inserted right after this if there are non-LGKM
+      // instructions still outstanding.
+      ForceZero = true;
+      EmitSwaitcnt = true;
+    }
+  }
+
+  // Does this operand processing indicate s_wait counter update?
+  if (EmitSwaitcnt) {
+    // Value to encode for each counter; -1 stands for "don't care".
+    int CntVal[NUM_INST_CNTS];
+
+    bool UseDefaultWaitcntStrategy = true;
+    if (ForceZero) {
+      // Force all waitcnts to 0.
+      ForAllInstCounterType(t) {
+        ScoreBrackets->setScoreLB(t, ScoreBrackets->getScoreUB(t));
+      }
+      CntVal[VM_CNT] = 0;
+      CntVal[EXP_CNT] = 0;
+      CntVal[LGKM_CNT] = 0;
+      UseDefaultWaitcntStrategy = false;
+    }
+
+    if (UseDefaultWaitcntStrategy) {
+      ForAllInstCounterType(T) {
+        if (EmitSwaitcnt & CNT_MASK(T)) {
+          int Delta =
+              ScoreBrackets->getScoreUB(T) - ScoreBrackets->getScoreLB(T);
+          int MaxDelta = ScoreBrackets->getWaitCountMax(T);
+          if (Delta >= MaxDelta) {
+            // The distance reaches the counter's maximum: drop the wait on
+            // this counter and, except for EXP_CNT, pull the lower bound up
+            // to MaxDelta below the upper bound.
+            Delta = -1;
+            if (T != EXP_CNT) {
+              ScoreBrackets->setScoreLB(
+                  T, ScoreBrackets->getScoreUB(T) - MaxDelta);
+            }
+            EmitSwaitcnt &= ~CNT_MASK(T);
+          }
+          CntVal[T] = Delta;
+        } else {
+          // If we are not waiting for a particular counter then encode
+          // it as -1 which means "don't care."
+          CntVal[T] = -1;
+        }
+      }
+    }
+
+    // If we are not waiting on any counter we can skip the wait altogether.
+    if (EmitSwaitcnt != 0) {
+      MachineInstr *OldWaitcnt = ScoreBrackets->getWaitcnt();
+      int Imm = (!OldWaitcnt) ? 0 : OldWaitcnt->getOperand(0).getImm();
+      // When there is no previous waitcnt, or its encoded counters differ
+      // from the ones computed now, flag the top block of the containing loop
+      // for a revisit.
+      if (!OldWaitcnt || ((Imm & 0xF) != (CntVal[VM_CNT] & 0xF)) ||
+          (((Imm >> 4) & 0x7) != (CntVal[EXP_CNT] & 0x7)) ||
+          (((Imm >> 8) & 0xF) != (CntVal[LGKM_CNT] & 0xF))) {
+        MachineLoop *ContainingLoop = MLI->getLoopFor(MI.getParent());
+        if (ContainingLoop) {
+          MachineBasicBlock *TBB = ContainingLoop->getTopBlock();
+          BlockWaitcntBrackets *ScoreBracket =
+              BlockWaitcntBracketsMap[TBB].get();
+          if (!ScoreBracket) {
+            assert(BlockVisitedSet.find(TBB) == BlockVisitedSet.end());
+            BlockWaitcntBracketsMap[TBB] = make_unique<BlockWaitcntBrackets>();
+            ScoreBracket = BlockWaitcntBracketsMap[TBB].get();
+          }
+          ScoreBracket->setRevisitLoop(true);
+          DEBUG(dbgs() << "set-revisit: block"
+                       << ContainingLoop->getTopBlock()->getNumber() << '\n';);
+        }
+      }
+
+      // Update an existing waitcount, or make a new one.
+      // NOTE(review): the visible part of insertWaitcntInBlock only caches
+      // S_WAITCNT instructions through setWaitcnt(); if that is the only
+      // producer, this condition never selects the cached instruction and a
+      // new one is always created. Confirm whether "==" was intended.
+      MachineFunction &MF = *MI.getParent()->getParent();
+      if (OldWaitcnt && OldWaitcnt->getOpcode() != AMDGPU::S_WAITCNT) {
+        SWaitInst = OldWaitcnt;
+      } else {
+        SWaitInst = MF.CreateMachineInstr(TII->get(AMDGPU::S_WAITCNT),
+                                          MI.getDebugLoc());
+        CompilerGeneratedWaitcntSet.insert(SWaitInst);
+      }
+
+      // Pack the immediate: VM_CNT in bits [3:0], EXP_CNT in bits [6:4] and
+      // LGKM_CNT in bits [11:8]. A "don't care" value of -1 becomes all ones
+      // in its field.
+      int64_t Val = (CntVal[VM_CNT] & 0xF) | ((CntVal[EXP_CNT] & 0x7) << 4) |
+                    ((CntVal[LGKM_CNT] & 0xF) << 8);
+      const MachineOperand &Op = MachineOperand::CreateImm(Val);
+      SWaitInst->addOperand(MF, Op);
+
+      if (CntVal[EXP_CNT] == 0) {
+        ScoreBrackets->setMixedExpTypes(false);
+      }
+    }
+  }
+
+  return SWaitInst;
+}
+
+void SIInsertWaitcnts::insertWaitcntBeforeCF(MachineBasicBlock &MBB,
+                                             MachineInstr *Waitcnt) {
+  // Put Waitcnt at the end of MBB, but ahead of the final instruction when
+  // that instruction is a branch. Only the very last instruction of the block
+  // is inspected.
+  if (!MBB.empty()) {
+    MachineBasicBlock::iterator Last = MBB.end();
+    --Last;
+    if (Last->isBranch()) {
+      MBB.insert(Last, Waitcnt);
+      return;
+    }
+  }
+
+  // Either the block is empty or it does not end in a branch: append.
+  MBB.push_back(Waitcnt);
+}
+
+void SIInsertWaitcnts::updateEventWaitCntAfter(
+    MachineInstr &Inst, BlockWaitcntBrackets *ScoreBrackets) {
+  // Record in ScoreBrackets the wait events that issuing Inst generates.
+  //
+  // Inst          - the instruction that has just been processed.
+  // ScoreBrackets - score state of the current block; updated in place
+  //                 through updateByEvent()/setPendingFlat().
+
+  // Now look at the instruction opcode. If it is a memory access
+  // instruction, update the upper-bound of the appropriate counter's
+  // bracket and the destination operand scores.
+  if (TII->isDS(Inst)) {
+    // A DS instruction counts as an LDS access when any of its memory
+    // operands is in the local address space, and as a GDS access otherwise.
+    bool FoundLDS = false;
+    for (const MachineMemOperand *Memop : Inst.memoperands()) {
+      unsigned AS = Memop->getAddrSpace();
+      if (AS == AMDGPUAS::LOCAL_ADDRESS) {
+        FoundLDS = true;
+        break;
+      }
+    }
+    if (!FoundLDS) {
+      ScoreBrackets->updateByEvent(TII, TRI, MRI, GDS_ACCESS, Inst);
+      ScoreBrackets->updateByEvent(TII, TRI, MRI, GDS_GPR_LOCK, Inst);
+    } else {
+      ScoreBrackets->updateByEvent(TII, TRI, MRI, LDS_ACCESS, Inst);
+    }
+  } else if (TII->isFLAT(Inst)) {
+    ScoreBrackets->updateByEvent(TII, TRI, MRI, VMEM_ACCESS, Inst);
+    ScoreBrackets->updateByEvent(TII, TRI, MRI, LDS_ACCESS, Inst);
+
+    // This is a flat memory operation. Check to see if it has memory
+    // tokens for both LDS and Memory, and if so mark it as a flat.
+    bool FoundLDSMem = false;
+    for (const MachineMemOperand *Memop : Inst.memoperands()) {
+      unsigned AS = Memop->getAddrSpace();
+      if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::FLAT_ADDRESS)
+        FoundLDSMem = true;
+    }
+
+    // This is a flat memory operation, so note it - it will require
+    // that both the VM and LGKM be flushed to zero if it is pending when
+    // a VM or LGKM dependency occurs.
+    if (FoundLDSMem) {
+      ScoreBrackets->setPendingFlat();
+    }
+  } else if (SIInstrInfo::isVMEM(Inst) &&
+             // TODO: get a better carve out.
+             Inst.getOpcode() != AMDGPU::BUFFER_WBINVL1 &&
+             Inst.getOpcode() != AMDGPU::BUFFER_WBINVL1_SC &&
+             Inst.getOpcode() != AMDGPU::BUFFER_WBINVL1_VOL) {
+    ScoreBrackets->updateByEvent(TII, TRI, MRI, VMEM_ACCESS, Inst);
+    // getAtomicNoRetOp() is a generated instruction-mapping lookup that
+    // yields -1 when the opcode has no mapping, so it must be compared
+    // against -1; using the raw result as a boolean would treat "no mapping"
+    // as true and record a GPR lock for every VMEM instruction.
+    if ( // TODO: assumed yes -- target_info->MemWriteNeedsExpWait() &&
+        (Inst.mayStore() ||
+         AMDGPU::getAtomicNoRetOp(Inst.getOpcode()) != -1)) {
+      ScoreBrackets->updateByEvent(TII, TRI, MRI, VMW_GPR_LOCK, Inst);
+    }
+  } else if (TII->isSMRD(Inst)) {
+    ScoreBrackets->updateByEvent(TII, TRI, MRI, SMEM_ACCESS, Inst);
+  } else {
+    switch (Inst.getOpcode()) {
+    case AMDGPU::S_SENDMSG:
+    case AMDGPU::S_SENDMSGHALT:
+      ScoreBrackets->updateByEvent(TII, TRI, MRI, SQ_MESSAGE, Inst);
+      break;
+    case AMDGPU::EXP:
+    case AMDGPU::EXP_DONE: {
+      // Operand 0 selects the export event: 32..63 is recorded as a
+      // parameter access, 12..15 as a position access, and everything else
+      // as a GPR lock.
+      int Imm = Inst.getOperand(0).getImm();
+      if (Imm >= 32 && Imm <= 63)
+        ScoreBrackets->updateByEvent(TII, TRI, MRI, EXP_PARAM_ACCESS, Inst);
+      else if (Imm >= 12 && Imm <= 15)
+        ScoreBrackets->updateByEvent(TII, TRI, MRI, EXP_POS_ACCESS, Inst);
+      else
+        ScoreBrackets->updateByEvent(TII, TRI, MRI, EXP_GPR_LOCK, Inst);
+      break;
+    }
+    case AMDGPU::S_MEMTIME:
+    case AMDGPU::S_MEMREALTIME:
+      ScoreBrackets->updateByEvent(TII, TRI, MRI, SMEM_ACCESS, Inst);
+      break;
+    default:
+      break;
+    }
+  }
+}
+
+void SIInsertWaitcnts::mergeInputScoreBrackets(MachineBasicBlock &Block) {
+  // Rebuild the entry state of Block's score brackets from the exit state of
+  // its predecessors and, for a block without successors, from the brackets
+  // saved for kill instructions (KillWaitBrackets).
+  //
+  // The merged state uses a lower bound of 0 for every counter and the
+  // largest outstanding span among the inputs as upper bound; register
+  // scores and event upper bounds of each input are rebased onto that range
+  // and combined by taking the maximum.
+  BlockWaitcntBrackets *ScoreBrackets = BlockWaitcntBracketsMap[&Block].get();
+  int32_t MaxPending[NUM_INST_CNTS] = {0};
+  int32_t MaxFlat[NUM_INST_CNTS] = {0};
+  bool MixedExpTypes = false;
+
+  // Kill brackets are only merged into blocks that have no successors.
+  // TODO: Is SC Block->IsMainExit() same as Block.succ_empty()?
+  const bool MergeKills = Block.succ_empty() && !KillWaitBrackets.empty();
+
+  // The two events that get special treatment below; both are measured
+  // against EXP_CNT.
+  const decltype(GDS_GPR_LOCK) GprLockEvents[] = {GDS_GPR_LOCK, EXP_GPR_LOCK};
+
+  // Fold the pending and flat spans of one input into the running maxima.
+  auto AccumulateSpans = [&](BlockWaitcntBrackets *In) {
+    ForAllInstCounterType(T) {
+      int Span = In->getScoreUB(T) - In->getScoreLB(T);
+      MaxPending[T] = std::max(MaxPending[T], Span);
+      Span = In->pendingFlat(T) - In->getScoreLB(T);
+      MaxFlat[T] = std::max(MaxFlat[T], Span);
+    }
+    MixedExpTypes |= In->mixedExpTypes();
+  };
+
+  // Fold the GDS_GPR_LOCK / EXP_GPR_LOCK spans of one input into the EXP_CNT
+  // maximum.
+  auto AccumulateGprLockSpans = [&](BlockWaitcntBrackets *In) {
+    for (auto Event : GprLockEvents) {
+      int Span = In->getEventUB(Event) - In->getScoreLB(EXP_CNT);
+      MaxPending[EXP_CNT] = std::max(MaxPending[EXP_CNT], Span);
+    }
+  };
+
+  // Rebase the register scores and event upper bounds of one input onto the
+  // merged bracket and keep the larger of the existing and rebased values.
+  auto MergeScoreboard = [&](BlockWaitcntBrackets *In) {
+    // Now merge the gpr_reg_score information.
+    ForAllInstCounterType(T) {
+      int InLB = In->getScoreLB(T);
+      int InUB = In->getScoreUB(T);
+      if (InLB < InUB) {
+        int Scale = MaxPending[T] - InUB;
+        // Merge vgpr scores.
+        for (int J = 0; J <= In->getMaxVGPR(); J++) {
+          int InRegScore = In->getRegScore(J, T);
+          if (InRegScore <= InLB)
+            continue;
+          int Rebased = Scale + InRegScore;
+          ScoreBrackets->setRegScore(
+              J, T, std::max(ScoreBrackets->getRegScore(J, T), Rebased));
+        }
+        // Also need to merge sgpr scores for lgkm_cnt.
+        if (T == LGKM_CNT) {
+          for (int J = 0; J <= In->getMaxSGPR(); J++) {
+            int InRegScore = In->getRegScore(J + NUM_ALL_VGPRS, LGKM_CNT);
+            if (InRegScore <= InLB)
+              continue;
+            int Rebased = Scale + InRegScore;
+            ScoreBrackets->setRegScore(
+                J + NUM_ALL_VGPRS, LGKM_CNT,
+                std::max(
+                    ScoreBrackets->getRegScore(J + NUM_ALL_VGPRS, LGKM_CNT),
+                    Rebased));
+          }
+        }
+      }
+    }
+
+    // Also merge the WaitEvent information.
+    ForAllWaitEventType(W) {
+      enum InstCounterType T = In->eventCounter(W);
+      int InEventUB = In->getEventUB(W);
+      if (InEventUB > In->getScoreLB(T)) {
+        int Rebased = MaxPending[T] + InEventUB - In->getScoreUB(T);
+        if (Rebased > 0) {
+          ScoreBrackets->setEventUB(
+              W, std::max(ScoreBrackets->getEventUB(W), Rebased));
+        }
+      }
+    }
+  };
+
+  // Clear the score bracket state.
+  ScoreBrackets->clear();
+
+  // Compute the number of pending elements on block entry.
+
+  // IMPORTANT NOTE: If iterative handling of loops is added, the code will
+  // need to handle single BBs with backedges to themselves. This means that
+  // they will need to retain and not clear their initial state.
+
+  // Walk the predecessors until the first one that has not been visited yet
+  // or that is marked as waiting at its beginning.
+  // NOTE(review): the loop stops at that predecessor instead of skipping it,
+  // so later predecessors are ignored as well - confirm this is intended.
+  for (MachineBasicBlock *Pred : Block.predecessors()) {
+    BlockWaitcntBrackets *PredBrackets = BlockWaitcntBracketsMap[Pred].get();
+    bool Seen = BlockVisitedSet.find(Pred) != BlockVisitedSet.end();
+    if (!Seen || PredBrackets->getWaitAtBeginning())
+      break;
+    AccumulateSpans(PredBrackets);
+  }
+
+  // Also handle kills for exit block.
+  if (MergeKills) {
+    for (const auto &Kill : KillWaitBrackets)
+      AccumulateSpans(&*Kill);
+  }
+
+  // Special handling for GDS_GPR_LOCK and EXP_GPR_LOCK.
+  for (MachineBasicBlock *Pred : Block.predecessors()) {
+    BlockWaitcntBrackets *PredBrackets = BlockWaitcntBracketsMap[Pred].get();
+    bool Seen = BlockVisitedSet.find(Pred) != BlockVisitedSet.end();
+    if (!Seen || PredBrackets->getWaitAtBeginning())
+      break;
+    AccumulateGprLockSpans(PredBrackets);
+  }
+
+  if (MergeKills) {
+    for (const auto &Kill : KillWaitBrackets)
+      AccumulateGprLockSpans(&*Kill);
+  }
+
+#if 0
+  // LC does not (unlike) add a waitcnt at beginning. Leaving it as marker.
+  // TODO: how does LC distinguish between function entry and main entry?
+  // If this is the entry to a function, force a wait.
+  MachineBasicBlock &Entry = Block.getParent()->front();
+  if (Entry.getNumber() == Block.getNumber()) {
+    ScoreBrackets->setWaitAtBeginning();
+    return;
+  }
+#endif
+
+  // Now set the current Block's brackets to the largest ending bracket.
+  ForAllInstCounterType(T) {
+    ScoreBrackets->setScoreUB(T, MaxPending[T]);
+    ScoreBrackets->setScoreLB(T, 0);
+    ScoreBrackets->setLastFlat(T, MaxFlat[T]);
+  }
+
+  ScoreBrackets->setMixedExpTypes(MixedExpTypes);
+
+  // Set the register scoreboard. Unlike the span passes above, this walk
+  // only stops at a predecessor that has not been visited.
+  for (MachineBasicBlock *Pred : Block.predecessors()) {
+    if (BlockVisitedSet.find(Pred) == BlockVisitedSet.end())
+      break;
+    MergeScoreboard(BlockWaitcntBracketsMap[Pred].get());
+  }
+
+  // Set the register scoreboard from the kill brackets as well.
+  if (MergeKills) {
+    for (const auto &Kill : KillWaitBrackets)
+      MergeScoreboard(&*Kill);
+  }
+
+  // Special case handling of GDS_GPR_LOCK and EXP_GPR_LOCK. Merge this for the
+  // sequencing predecessors, because changes to EXEC require waitcnts due to
+  // the delayed nature of these operations.
+  for (MachineBasicBlock *Pred : Block.predecessors()) {
+    if (BlockVisitedSet.find(Pred) == BlockVisitedSet.end())
+      break;
+
+    BlockWaitcntBrackets *PredBrackets = BlockWaitcntBracketsMap[Pred].get();
+
+    for (auto Event : GprLockEvents) {
+      int PredEventUB = PredBrackets->getEventUB(Event);
+      if (PredEventUB > PredBrackets->getScoreLB(EXP_CNT)) {
+        int Rebased = MaxPending[EXP_CNT] + PredEventUB -
+                      PredBrackets->getScoreUB(EXP_CNT);
+        if (Rebased > 0) {
+          ScoreBrackets->setEventUB(
+              Event, std::max(ScoreBrackets->getEventUB(Event), Rebased));
+        }
+      }
+    }
+  }
+}
+
+/// Find the "bottom" block of \p Loop, i.e. the member block carrying the
+/// highest block number. Unlike MachineLoop::getBottomBlock this also works
+/// for a loop whose blocks are not laid out contiguously.
+MachineBasicBlock *SIInsertWaitcnts::loopBottom(const MachineLoop *Loop) {
+  // Start from the header and keep whichever block has the largest number.
+  MachineBasicBlock *Candidate = Loop->getHeader();
+  int HighestNumber = Candidate->getNumber();
+  for (MachineBasicBlock *MBB : Loop->blocks()) {
+    const int Number = MBB->getNumber();
+    if (Number <= HighestNumber)
+      continue;
+    Candidate = MBB;
+    HighestNumber = Number;
+  }
+  return Candidate;
+}
+
+// Generate s_waitcnt instructions where needed.
+void SIInsertWaitcnts::insertWaitcntInBlock(MachineFunction &MF,
+                                            MachineBasicBlock &Block) {
+  // Initialize the state information.
+  mergeInputScoreBrackets(Block);
+
+  BlockWaitcntBrackets *ScoreBrackets = BlockWaitcntBracketsMap[&Block].get();
+
+  DEBUG({
+    dbgs() << "Block" << Block.getNumber();
+    ScoreBrackets->dump();
+  });
+
+  bool InsertNOP = false;
+
+  // Walk over the instructions.
+  for (MachineBasicBlock::iterator Iter = Block.begin(), E = Block.end();
+       Iter != E;) {
+    MachineInstr &Inst = *Iter;
+    // Remove any previously existing waitcnts.
+    if (Inst.getOpcode() == AMDGPU::S_WAITCNT) {
+      // TODO: Register the old waitcnt and optimize the following waitcnts.
+      // Leaving the previously existing waitcnts is conservatively correct.
+      if (CompilerGeneratedWaitcntSet.find(&Inst) ==
+          CompilerGeneratedWaitcntSet.end())
+        ++Iter;
+      else {
+        ScoreBrackets->setWaitcnt(&Inst);
+        ++Iter;
+        Inst.removeFromParent();
+      }
+      continue;
+    }
+
+    // Check to see if this is an S_BARRIER, and if an implicit S_WAITCNT 0
+    // occurs before the instruction. Doing it here prevents any additional
+    // S_WAITCNTs from being emitted if the instruction was marked as
+    // requiring a WAITCNT beforehand.
+    if (Inst.getOpcode() == AMDGPU::S_BARRIER &&
+        ST->needWaitcntBeforeBarrier()) {
+      ScoreBrackets->updateByWait(VM_CNT, ScoreBrackets->getScoreUB(VM_CNT));
+      ScoreBrackets->updateByWait(EXP_CNT, ScoreBrackets->getScoreUB(EXP_CNT));
+      ScoreBrackets->updateByWait(LGKM_CNT,
+                                  ScoreBrackets->getScoreUB(LGKM_CNT));
+    }
+
+    // Kill instructions generate a conditional branch to the endmain block.
+    // Merge the current waitcnt state into the endmain block information.
+    // TODO: Are there other flavors of KILL instruction?
+    if (Inst.getOpcode() == AMDGPU::KILL) {
+      addKillWaitBracket(ScoreBrackets);
+    }
+
+    bool VCCZBugWorkAround = false;
+    if (readsVCCZ(Inst)) {
+      if (ScoreBrackets->getScoreLB(LGKM_CNT) <
+              ScoreBrackets->getScoreUB(LGKM_CNT) &&
+          ScoreBrackets->hasPendingSMEM()) {
+#if 0
+        // TODO: Enable this assert and fix the scheduler.
+        //       Shader Compiler assert is also independent of target.
+        // If you hit this, it most likely means that a S_LOAD_DWORDX was issued
+        // between a def of vcc and the consumer of vccz/vccnz. This is not
+        // expected to happen in practice because the 'wave_cf' phase (which is
+        // where all of the uses of vccz get generated) runs after
+        // the 'pre_ra_scheduler' phase.
+        assert(0 &&  !"SMRD instruction could retire during the live range of VCCZ, "
+	              "therefore Hardware Bug 457939 could be triggered" );
+#endif
+        if (ST->getGeneration() <= SISubtarget::SEA_ISLANDS)
+          VCCZBugWorkAround = true;
+      }
+    }
+
+    // Generate an s_waitcnt instruction to be placed before
+    // cur_Inst, if needed.
+    MachineInstr *SWaitInst = generateSWaitCntInstBefore(Inst, ScoreBrackets);
+
+    if (SWaitInst) {
+      Block.insert(Inst, SWaitInst);
+      if (ScoreBrackets->getWaitcnt() != SWaitInst) {
+        DEBUG(dbgs() << "insertWaitcntInBlock\n"
+                     << "Old Instr: " << Inst << '\n'
+                     << "New Instr: " << *SWaitInst << '\n';);
+      }
+    }
+
+    updateEventWaitCntAfter(Inst, ScoreBrackets);
+
+#if 0 // TODO: implement resource type check controlled by options with ub = LB.
+    // If this instruction generates a S_SETVSKIP because it is an
+    // indexed resource, and we are on Tahiti, then it will also force
+    // an S_WAITCNT vmcnt(0)
+    if (RequireCheckResourceType(Inst, context)) {
+      // Force the score to as if an S_WAITCNT vmcnt(0) is emitted.
+      ScoreBrackets->setScoreLB(VM_CNT,
+				   ScoreBrackets->getScoreUB(VM_CNT));
+    }
+#endif
+
+    ScoreBrackets->clearWaitcnt();
+
+    if (SWaitInst) {
+      DEBUG({ SWaitInst->print(dbgs() << '\n'); });
+    }
+    DEBUG({
+      Inst.print(dbgs());
+      ScoreBrackets->dump();
+    });
+
+    // Check to see if this is a GWS instruction. If so, and if this is CI or
+    // VI, then the generated code sequence will include an S_WAITCNT 0.
+    // TODO: Are these the only GWS instructions?
+    if (Inst.getOpcode() == AMDGPU::DS_GWS_INIT ||
+        Inst.getOpcode() == AMDGPU::DS_GWS_SEMA_V ||
+        Inst.getOpcode() == AMDGPU::DS_GWS_SEMA_BR ||
+        Inst.getOpcode() == AMDGPU::DS_GWS_SEMA_P ||
+        Inst.getOpcode() == AMDGPU::DS_GWS_BARRIER) {
+      // TODO: && context->target_info->GwsRequiresMemViolTest() ) {
+      ScoreBrackets->updateByWait(VM_CNT, ScoreBrackets->getScoreUB(VM_CNT));
+      ScoreBrackets->updateByWait(EXP_CNT, ScoreBrackets->getScoreUB(EXP_CNT));
+      ScoreBrackets->updateByWait(LGKM_CNT,
+                                  ScoreBrackets->getScoreUB(LGKM_CNT));
+    }
+
+    // TODO: Remove this work-around after fixing the scheduler and enable the
+    // assert above.
+    if (VCCZBugWorkAround) {
+      // Restore the vccz bit.  Any time a value is written to vcc, the vcc
+      // bit is updated, so we can restore the bit by reading the value of
+      // vcc and then writing it back to the register.
+      BuildMI(Block, Inst, Inst.getDebugLoc(), TII->get(AMDGPU::S_MOV_B64),
+              AMDGPU::VCC)
+          .addReg(AMDGPU::VCC);
+    }
+
+    if (ST->getGeneration() >= SISubtarget::VOLCANIC_ISLANDS) {
+
+      // This avoids a s_nop after a waitcnt has just been inserted.
+      if (!SWaitInst && InsertNOP)
+        BuildMI(Block, Inst, DebugLoc(), TII->get(AMDGPU::S_NOP)).addImm(0);
+      InsertNOP = false;
+
+      // Any occurrence of consecutive VMEM or SMEM instructions forms a VMEM
+      // or SMEM clause, respectively.
+      //
+      // The temporary workaround is to break the clauses with S_NOP.
+      //
+      // The proper solution would be to allocate registers such that all source
+      // and destination registers don't overlap, e.g. this is illegal:
+      //   r0 = load r2
+      //   r2 = load r0
+      bool IsSMEM = false;
+      bool IsVMEM = false;
+      if (TII->isSMRD(Inst))
+        IsSMEM = true;
+      else if (TII->usesVM_CNT(Inst))
+        IsVMEM = true;
+
+      ++Iter;
+      if (Iter == E)
+        break;
+
+      MachineInstr &Next = *Iter;
+
+      // TODO: How about consecutive SMEM instructions?
+      //       The comments above says break the clause but the code does not.
+      // if ((TII->isSMRD(next) && isSMEM) ||
+      if (!IsSMEM && TII->usesVM_CNT(Next) && IsVMEM &&
+          // TODO: Enable this check when hasSoftClause is upstreamed.
+          // ST->hasSoftClauses() &&
+          ST->isXNACKEnabled()) {
+        // Insert a NOP to break the clause.
+        InsertNOP = true;
+        continue;
+      }
+
+      // There must be "S_NOP 0" between an instruction writing M0 and
+      // S_SENDMSG.
+      if ((Next.getOpcode() == AMDGPU::S_SENDMSG ||
+           Next.getOpcode() == AMDGPU::S_SENDMSGHALT) &&
+          Inst.definesRegister(AMDGPU::M0))
+        InsertNOP = true;
+
+      continue;
+    }
+
+    ++Iter;
+  }
+
+  // Check if we need to force convergence at loop footer.
+  MachineLoop *ContainingLoop = MLI->getLoopFor(&Block);
+  if (ContainingLoop && loopBottom(ContainingLoop) == &Block) {
+    LoopWaitcntData *WaitcntData = LoopWaitcntDataMap[ContainingLoop].get();
+    WaitcntData->print();
+    DEBUG(dbgs() << '\n';);
+
+    // The iterative waitcnt insertion algorithm aims for optimal waitcnt
+    // placement and doesn't always guarantee convergence for a loop. Each
+    // loop should take at most 2 iterations for it to converge naturally.
+    // When this max is reached and result doesn't converge, we force
+    // convergence by inserting a s_waitcnt at the end of loop footer.
+    if (WaitcntData->getIterCnt() > 2) {
+      // To ensure convergence, need to make wait events at loop footer be no
+      // more than those from the previous iteration.
+      // As a simplification, Instead of tracking individual scores and
+      // generate the precise wait count, just wait on 0.
+      bool HasPending = false;
+      MachineInstr *SWaitInst = WaitcntData->getWaitcnt();
+      ForAllInstCounterType(T) {
+        if (ScoreBrackets->getScoreUB(T) > ScoreBrackets->getScoreLB(T)) {
+          ScoreBrackets->setScoreLB(T, ScoreBrackets->getScoreUB(T));
+          HasPending = true;
+        }
+      }
+
+      if (HasPending) {
+        if (!SWaitInst) {
+          SWaitInst = Block.getParent()->CreateMachineInstr(
+              TII->get(AMDGPU::S_WAITCNT), DebugLoc());
+          CompilerGeneratedWaitcntSet.insert(SWaitInst);
+          const MachineOperand &Op = MachineOperand::CreateImm(0);
+          SWaitInst->addOperand(MF, Op);
+#if 0 // TODO: Format the debug output
+          OutputTransformBanner("insertWaitcntInBlock",0,"Create:",context);
+          OutputTransformAdd(SWaitInst, context);
+#endif
+        }
+#if 0 // TODO: ??
+        _DEV( REPORTED_STATS->force_waitcnt_converge = 1; )
+#endif
+      }
+
+      if (SWaitInst) {
+        DEBUG({
+          SWaitInst->print(dbgs());
+          dbgs() << "\nAdjusted score board:";
+          ScoreBrackets->dump();
+        });
+
+        // Add this waitcnt to the block. It is either newly created or
+        // created in previous iterations and added back since block traversal
+        // always remove waitcnt.
+        insertWaitcntBeforeCF(Block, SWaitInst);
+        WaitcntData->setWaitcnt(SWaitInst);
+      }
+    }
+  }
+}
+
+/// Pass entry point. Walks the blocks of \p MF in reverse post-order to place
+/// s_waitcnt, re-walking a loop when its top block requested a revisit, then
+/// adds s_dcache_wb before wave termination points if scalar stores exist.
+/// \returns true if the function may have been modified.
+bool SIInsertWaitcnts::runOnMachineFunction(MachineFunction &MF) {
+  ST = &MF.getSubtarget<SISubtarget>();
+  TII = ST->getInstrInfo();
+  TRI = &TII->getRegisterInfo();
+  MRI = &MF.getRegInfo();
+  MLI = &getAnalysis<MachineLoopInfo>();
+
+  AMDGPU::IsaInfo::IsaVersion IV =
+      AMDGPU::IsaInfo::getIsaVersion(ST->getFeatureBits());
+  HardwareLimits.VmcntMax = AMDGPU::getVmcntBitMask(IV);
+  HardwareLimits.ExpcntMax = AMDGPU::getExpcntBitMask(IV);
+  HardwareLimits.LgkmcntMax = AMDGPU::getLgkmcntBitMask(IV);
+
+  HardwareLimits.NumVGPRsMax = ST->getAddressableNumVGPRs();
+  HardwareLimits.NumSGPRsMax = ST->getAddressableNumSGPRs();
+  assert(HardwareLimits.NumVGPRsMax <= SQ_MAX_PGM_VGPRS);
+  assert(HardwareLimits.NumSGPRsMax <= SQ_MAX_PGM_SGPRS);
+
+  // Walk over the blocks in reverse post-order, inserting s_waitcnt where
+  // needed.
+  ReversePostOrderTraversal<MachineFunction *> RPOT(&MF);
+  bool Modified = false;
+  for (ReversePostOrderTraversal<MachineFunction *>::rpo_iterator
+           I = RPOT.begin(),
+           E = RPOT.end(), J = RPOT.begin();
+       I != E;) {
+    MachineBasicBlock &MBB = **I;
+
+    BlockVisitedSet.insert(&MBB);
+
+    BlockWaitcntBrackets *ScoreBrackets = BlockWaitcntBracketsMap[&MBB].get();
+    if (!ScoreBrackets) {
+      BlockWaitcntBracketsMap[&MBB] = make_unique<BlockWaitcntBrackets>();
+      ScoreBrackets = BlockWaitcntBracketsMap[&MBB].get();
+    }
+    ScoreBrackets->setPostOrder(MBB.getNumber());
+    MachineLoop *ContainingLoop = MLI->getLoopFor(&MBB);
+    if (ContainingLoop && LoopWaitcntDataMap[ContainingLoop] == nullptr)
+      LoopWaitcntDataMap[ContainingLoop] = make_unique<LoopWaitcntData>();
+
+    // If we are walking into the block from before the loop, then guarantee
+    // at least 1 re-walk over the loop to propagate the information, even if
+    // no S_WAITCNT instructions were generated.
+    if (ContainingLoop && ContainingLoop->getTopBlock() == &MBB && J < I &&
+        (BlockWaitcntProcessedSet.find(&MBB) ==
+         BlockWaitcntProcessedSet.end())) {
+      BlockWaitcntBracketsMap[&MBB]->setRevisitLoop(true);
+      DEBUG(dbgs() << "set-revisit: block"
+                   << ContainingLoop->getTopBlock()->getNumber() << '\n';);
+    }
+
+    // Walk over the instructions.
+    insertWaitcntInBlock(MF, MBB);
+    // The walk may insert instructions (s_waitcnt, s_nop, ...) and this call
+    // yields no change flag, so conservatively report a modification.
+    Modified = true;
+
+    // Flag that waitcnts have been processed at least once.
+    BlockWaitcntProcessedSet.insert(&MBB);
+
+    // See if we want to revisit the loop.
+    if (ContainingLoop && loopBottom(ContainingLoop) == &MBB) {
+      MachineBasicBlock *EntryBB = ContainingLoop->getTopBlock();
+      BlockWaitcntBrackets *EntrySB = BlockWaitcntBracketsMap[EntryBB].get();
+      if (EntrySB && EntrySB->getRevisitLoop()) {
+        EntrySB->setRevisitLoop(false);
+        J = I;
+        int32_t PostOrder = EntrySB->getPostOrder();
+        // TODO: Avoid this loop. Find another way to set I.
+        for (ReversePostOrderTraversal<MachineFunction *>::rpo_iterator
+                 X = RPOT.begin(),
+                 Y = RPOT.end();
+             X != Y; ++X) {
+          MachineBasicBlock &MBBX = **X;
+          if (MBBX.getNumber() == PostOrder) {
+            I = X;
+            break;
+          }
+        }
+        LoopWaitcntData *WaitcntData = LoopWaitcntDataMap[ContainingLoop].get();
+        WaitcntData->incIterCnt();
+        DEBUG(dbgs() << "revisit: block" << EntryBB->getNumber() << '\n';);
+        continue;
+      } else {
+        LoopWaitcntData *WaitcntData = LoopWaitcntDataMap[ContainingLoop].get();
+        // Loop converged, reset iteration count. If this loop gets revisited,
+        // it must be from an outer loop, the counter will restart, this will
+        // ensure we don't force convergence on such revisits.
+        WaitcntData->resetIterCnt();
+      }
+    }
+
+    J = I;
+    ++I;
+  }
+
+  SmallVector<MachineBasicBlock *, 4> EndPgmBlocks;
+
+  bool HaveScalarStores = false;
+
+  // Find the wave termination blocks and whether any scalar store exists.
+  for (MachineBasicBlock &MBB : MF) {
+    for (MachineInstr &MI : MBB) {
+      if (!HaveScalarStores && TII->isScalarStore(MI))
+        HaveScalarStores = true;
+
+      if (MI.getOpcode() == AMDGPU::S_ENDPGM ||
+          MI.getOpcode() == AMDGPU::SI_RETURN_TO_EPILOG)
+        EndPgmBlocks.push_back(&MBB);
+    }
+  }
+
+  if (HaveScalarStores) {
+    // If scalar writes are used, the cache must be flushed or else the next
+    // wave to reuse the same scratch memory can be clobbered.
+    //
+    // Insert s_dcache_wb at wave termination points if there were any scalar
+    // stores, and only if the cache hasn't already been flushed. This could be
+    // improved by looking across blocks for flushes in postdominating blocks
+    // from the stores but an explicitly requested flush is probably very rare.
+    for (MachineBasicBlock *MBB : EndPgmBlocks) {
+      bool SeenDCacheWB = false;
+      for (MachineBasicBlock::iterator I = MBB->begin(), E = MBB->end(); I != E;
+           ++I) {
+        if (I->getOpcode() == AMDGPU::S_DCACHE_WB)
+          SeenDCacheWB = true;
+        else if (TII->isScalarStore(*I))
+          SeenDCacheWB = false;
+
+        // FIXME: It would be better to insert this before a waitcnt if any.
+        if ((I->getOpcode() == AMDGPU::S_ENDPGM ||
+             I->getOpcode() == AMDGPU::SI_RETURN_TO_EPILOG) &&
+            !SeenDCacheWB) {
+          Modified = true;
+          BuildMI(*MBB, I, I->getDebugLoc(), TII->get(AMDGPU::S_DCACHE_WB));
+        }
+      }
+    }
+  }
+
+  return Modified;
+}
Index: test/CodeGen/AMDGPU/basic-branch.ll
===================================================================
--- test/CodeGen/AMDGPU/basic-branch.ll
+++ test/CodeGen/AMDGPU/basic-branch.ll
@@ -34,8 +34,6 @@
 ; GCN: s_cbranch_vccnz [[END:BB[0-9]+_[0-9]+]]
 
 ; GCN: buffer_store_dword
-; GCNOPT-NEXT: s_waitcnt vmcnt(0) expcnt(0)
-; TODO: This waitcnt can be eliminated
 
 ; GCN: {{^}}[[END]]:
 ; GCN: s_endpgm
Index: test/CodeGen/AMDGPU/branch-condition-and.ll
===================================================================
--- test/CodeGen/AMDGPU/branch-condition-and.ll
+++ test/CodeGen/AMDGPU/branch-condition-and.ll
@@ -19,9 +19,8 @@
 
 ; GCN-NEXT: BB{{[0-9]+_[0-9]+}}: ; %bb4
 ; GCN: ds_write_b32
-; GCN: s_waitcnt
 
-; GCN-NEXT: [[BB5]]
+; GCN: [[BB5]]
 ; GCN: s_or_b64 exec, exec
 ; GCN-NEXT: s_endpgm
 ; GCN-NEXT: .Lfunc_end
Index: test/CodeGen/AMDGPU/branch-relaxation.ll
===================================================================
--- test/CodeGen/AMDGPU/branch-relaxation.ll
+++ test/CodeGen/AMDGPU/branch-relaxation.ll
@@ -223,7 +223,6 @@
 ; GCN-NEXT: [[BB2]]: ; %bb2
 ; GCN: v_mov_b32_e32 [[BB2_K:v[0-9]+]], 17
 ; GCN: buffer_store_dword [[BB2_K]]
-; GCN: s_waitcnt vmcnt(0)
 
 ; GCN-NEXT: [[LONG_JUMP1:BB[0-9]+_[0-9]+]]: ; %bb2
 ; GCN-NEXT: s_getpc_b64 vcc
@@ -393,7 +392,6 @@
 
 ; GCN-NEXT: ; BB#2: ; %if_uniform
 ; GCN: buffer_store_dword
-; GCN: s_waitcnt vmcnt(0)
 
 ; GCN-NEXT: [[ENDIF]]: ; %endif
 ; GCN-NEXT: s_or_b64 exec, exec, [[MASK]]
Index: test/CodeGen/AMDGPU/control-flow-fastregalloc.ll
===================================================================
--- test/CodeGen/AMDGPU/control-flow-fastregalloc.ll
+++ test/CodeGen/AMDGPU/control-flow-fastregalloc.ll
@@ -37,22 +37,21 @@
 
 ; GCN: s_mov_b64 exec, s{{\[}}[[ANDEXEC_LO]]:[[ANDEXEC_HI]]{{\]}}
 
-; GCN: s_waitcnt vmcnt(0) expcnt(0)
 ; GCN: mask branch [[ENDIF:BB[0-9]+_[0-9]+]]
 
 ; GCN: {{^}}BB{{[0-9]+}}_1: ; %if
 ; GCN: s_mov_b32 m0, -1
 ; GCN: ds_read_b32 [[LOAD1:v[0-9]+]]
+; GCN: s_waitcnt lgkmcnt(0)
 ; GCN: buffer_load_dword [[RELOAD_LOAD0:v[0-9]+]], off, s[0:3], s7 offset:[[LOAD0_OFFSET]] ; 4-byte Folded Reload
-; GCN: s_waitcnt vmcnt(0)
 
 ; Spill val register
 ; GCN: v_add_i32_e32 [[VAL:v[0-9]+]], vcc, [[LOAD1]], [[RELOAD_LOAD0]]
 ; GCN: buffer_store_dword [[VAL]], off, s[0:3], s7 offset:[[VAL_OFFSET:[0-9]+]] ; 4-byte Folded Spill
-; GCN: s_waitcnt vmcnt(0)
 
 ; VMEM: [[ENDIF]]:
 ; Reload and restore exec mask
+; VGPR: s_waitcnt lgkmcnt(0)
 ; VGPR: v_readlane_b32 s[[S_RELOAD_SAVEEXEC_LO:[0-9]+]], [[SPILL_VGPR]], [[SAVEEXEC_LO_LANE]]
 ; VGPR: v_readlane_b32 s[[S_RELOAD_SAVEEXEC_HI:[0-9]+]], [[SPILL_VGPR]], [[SAVEEXEC_HI_LANE]]
 
@@ -119,7 +118,6 @@
 
 ; GCN: s_mov_b64 exec, s{{\[}}[[ANDEXEC_LO]]:[[ANDEXEC_HI]]{{\]}}
 
-; GCN: s_waitcnt vmcnt(0) expcnt(0)
 ; GCN-NEXT: ; mask branch [[END:BB[0-9]+_[0-9]+]]
 ; GCN-NEXT: s_cbranch_execz [[END]]
 
@@ -130,7 +128,6 @@
 ; GCN: v_cmp_ne_u32_e32 vcc,
 ; GCN: s_and_b64 vcc, exec, vcc
 ; GCN: buffer_store_dword [[VAL_LOOP]], off, s[0:3], s7 offset:[[VAL_SUB_OFFSET:[0-9]+]] ; 4-byte Folded Spill
-; GCN: s_waitcnt vmcnt(0) expcnt(0)
 ; GCN-NEXT: s_cbranch_vccnz [[LOOP]]
 
 
@@ -197,7 +194,6 @@
 ; VMEM: buffer_store_dword v[[V_SAVEEXEC_HI]], off, s[0:3], s7 offset:[[SAVEEXEC_HI_OFFSET:[0-9]+]] ; 4-byte Folded Spill
 
 ; GCN: s_mov_b64 exec, [[CMP0]]
-; GCN: s_waitcnt vmcnt(0) expcnt(0)
 
 ; FIXME: It makes no sense to put this skip here
 ; GCN-NEXT: ; mask branch [[FLOW:BB[0-9]+_[0-9]+]]
@@ -235,7 +231,6 @@
 
 ; GCN: buffer_store_dword [[FLOW_VAL]], off, s[0:3], s7 offset:[[RESULT_OFFSET:[0-9]+]] ; 4-byte Folded Spill
 ; GCN: s_xor_b64 exec, exec, s{{\[}}[[FLOW_S_RELOAD_SAVEEXEC_LO]]:[[FLOW_S_RELOAD_SAVEEXEC_HI]]{{\]}}
-; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0)
 ; GCN-NEXT: ; mask branch [[ENDIF:BB[0-9]+_[0-9]+]]
 ; GCN-NEXT: s_cbranch_execz [[ENDIF]]
 
@@ -245,14 +240,12 @@
 ; GCN: buffer_load_dword v[[LOAD0_RELOAD:[0-9]+]], off, s[0:3], s7 offset:4 ; 4-byte Folded Reload
 ; GCN: v_add_i32_e32 [[ADD:v[0-9]+]], vcc, v{{[0-9]+}}, v[[LOAD0_RELOAD]]
 ; GCN: buffer_store_dword [[ADD]], off, s[0:3], s7 offset:[[RESULT_OFFSET]] ; 4-byte Folded Spill
-; GCN: s_waitcnt vmcnt(0) expcnt(0)
 ; GCN-NEXT: s_branch [[ENDIF:BB[0-9]+_[0-9]+]]
 
 ; GCN: [[ELSE]]: ; %else
 ; GCN: buffer_load_dword v[[LOAD0_RELOAD:[0-9]+]], off, s[0:3], s7 offset:4 ; 4-byte Folded Reload
 ; GCN: v_subrev_i32_e32 [[SUB:v[0-9]+]], vcc, v{{[0-9]+}}, v[[LOAD0_RELOAD]]
 ; GCN: buffer_store_dword [[ADD]], off, s[0:3], s7 offset:[[FLOW_RESULT_OFFSET:[0-9]+]] ; 4-byte Folded Spill
-; GCN: s_waitcnt vmcnt(0) expcnt(0)
 ; GCN-NEXT: s_branch [[FLOW]]
 
 ; GCN: [[ENDIF]]:
Index: test/CodeGen/AMDGPU/indirect-addressing-si.ll
===================================================================
--- test/CodeGen/AMDGPU/indirect-addressing-si.ll
+++ test/CodeGen/AMDGPU/indirect-addressing-si.ll
@@ -120,8 +120,7 @@
 ; FIXME: The waitcnt for the argument load can go after the loop
 ; IDXMODE: s_set_gpr_idx_on 0, src0
 ; GCN: s_mov_b64 s{{\[[0-9]+:[0-9]+\]}}, exec
-; GCN: s_waitcnt lgkmcnt(0)
-
+; GCN: [[LOOPBB:BB[0-9]+_[0-9]+]]:
 ; GCN: v_readfirstlane_b32 [[READLANE:s[0-9]+]], v{{[0-9]+}}
 
 ; MOVREL: s_add_i32 m0, [[READLANE]], 0xfffffe0
@@ -250,8 +249,6 @@
 ; GCN-DAG: v_mov_b32_e32 [[VEC_ELT3:v[0-9]+]], 4{{$}}
 
 ; GCN: s_mov_b64 [[SAVEEXEC:s\[[0-9]+:[0-9]+\]]], exec
-; GCN: s_waitcnt lgkmcnt(0)
-
 ; GCN: [[LOOPBB:BB[0-9]+_[0-9]+]]:
 ; GCN: v_readfirstlane_b32 [[READLANE:s[0-9]+]]
 
@@ -290,7 +287,6 @@
 ; IDXMODE: s_set_gpr_idx_on 0, dst
 
 ; GCN: s_mov_b64 [[SAVEEXEC:s\[[0-9]+:[0-9]+\]]], exec
-; GCN: s_waitcnt lgkmcnt(0)
 
 ; The offset depends on the register that holds the first element of the vector.
 ; GCN: v_readfirstlane_b32 [[READLANE:s[0-9]+]]
@@ -330,9 +326,9 @@
 ; IDXMODE: s_set_gpr_idx_on 0, src0
 
 ; GCN: s_mov_b64 [[MASK:s\[[0-9]+:[0-9]+\]]], exec
-; GCN: s_waitcnt vmcnt(0)
 
 ; GCN: [[LOOP0:BB[0-9]+_[0-9]+]]:
+; GCN-NEXT: s_waitcnt vmcnt(0)
 ; GCN-NEXT: v_readfirstlane_b32 [[READLANE:s[0-9]+]], [[IDX0]]
 ; GCN: v_cmp_eq_u32_e32 vcc, [[READLANE]], [[IDX0]]
 
@@ -411,6 +407,7 @@
 ; IDXMODE: s_set_gpr_idx_on 0, dst
 
 ; GCN: [[LOOP0:BB[0-9]+_[0-9]+]]:
+; GCN-NEXT: s_waitcnt vmcnt(0)
 ; GCN-NEXT: v_readfirstlane_b32 [[READLANE:s[0-9]+]], [[IDX0]]
 ; GCN: v_cmp_eq_u32_e32 vcc, [[READLANE]], [[IDX0]]
 
Index: test/CodeGen/AMDGPU/infinite-loop.ll
===================================================================
--- test/CodeGen/AMDGPU/infinite-loop.ll
+++ test/CodeGen/AMDGPU/infinite-loop.ll
@@ -4,8 +4,8 @@
 ; SI-LABEL: {{^}}infinite_loop:
 ; SI: v_mov_b32_e32 [[REG:v[0-9]+]], 0x3e7
 ; SI: BB0_1:
+; SI: s_waitcnt lgkmcnt(0)
 ; SI: buffer_store_dword [[REG]]
-; SI: s_waitcnt vmcnt(0) expcnt(0)
 ; SI: s_branch BB0_1
 define amdgpu_kernel void @infinite_loop(i32 addrspace(1)* %out) {
 entry:
Index: test/CodeGen/AMDGPU/llvm.amdgcn.buffer.store.format.ll
===================================================================
--- test/CodeGen/AMDGPU/llvm.amdgcn.buffer.store.format.ll
+++ test/CodeGen/AMDGPU/llvm.amdgcn.buffer.store.format.ll
@@ -58,7 +58,7 @@
 ;
 ;CHECK-LABEL: {{^}}buffer_store_wait:
 ;CHECK: buffer_store_format_xyzw v[0:3], v4, s[0:3], 0 idxen
-;CHECK: s_waitcnt vmcnt(0) expcnt(0)
+;CHECK: s_waitcnt expcnt(0)
 ;CHECK: buffer_load_format_xyzw v[0:3], v5, s[0:3], 0 idxen
 ;CHECK: s_waitcnt vmcnt(0)
 ;CHECK: buffer_store_format_xyzw v[0:3], v6, s[0:3], 0 idxen
Index: test/CodeGen/AMDGPU/llvm.amdgcn.buffer.store.ll
===================================================================
--- test/CodeGen/AMDGPU/llvm.amdgcn.buffer.store.ll
+++ test/CodeGen/AMDGPU/llvm.amdgcn.buffer.store.ll
@@ -58,7 +58,7 @@
 ;
 ;CHECK-LABEL: {{^}}buffer_store_wait:
 ;CHECK: buffer_store_dwordx4 v[0:3], v4, s[0:3], 0 idxen
-;CHECK: s_waitcnt vmcnt(0) expcnt(0)
+;CHECK: s_waitcnt expcnt(0)
 ;CHECK: buffer_load_dwordx4 v[0:3], v5, s[0:3], 0 idxen
 ;CHECK: s_waitcnt vmcnt(0)
 ;CHECK: buffer_store_dwordx4 v[0:3], v6, s[0:3], 0 idxen
Index: test/CodeGen/AMDGPU/llvm.amdgcn.image.ll
===================================================================
--- test/CodeGen/AMDGPU/llvm.amdgcn.image.ll
+++ test/CodeGen/AMDGPU/llvm.amdgcn.image.ll
@@ -130,7 +130,7 @@
 ;
 ; GCN-LABEL: {{^}}image_store_wait:
 ; GCN: image_store v[0:3], v4, s[0:7] dmask:0xf unorm
-; GCN: s_waitcnt vmcnt(0) expcnt(0)
+; GCN: s_waitcnt expcnt(0)
 ; GCN: image_load v[0:3], v4, s[8:15] dmask:0xf unorm
 ; GCN: s_waitcnt vmcnt(0)
 ; GCN: image_store v[0:3], v4, s[16:23] dmask:0xf unorm
Index: test/CodeGen/AMDGPU/llvm.amdgcn.s.dcache.inv.ll
===================================================================
--- test/CodeGen/AMDGPU/llvm.amdgcn.s.dcache.inv.ll
+++ test/CodeGen/AMDGPU/llvm.amdgcn.s.dcache.inv.ll
@@ -20,7 +20,7 @@
 ; GCN: s_waitcnt lgkmcnt(0) ; encoding
 define amdgpu_kernel void @test_s_dcache_inv_insert_wait() #0 {
   call void @llvm.amdgcn.s.dcache.inv()
-  call void @llvm.amdgcn.s.waitcnt(i32 0)
+  call void @llvm.amdgcn.s.waitcnt(i32 127)
   br label %end
 
 end:
Index: test/CodeGen/AMDGPU/llvm.amdgcn.s.dcache.inv.vol.ll
===================================================================
--- test/CodeGen/AMDGPU/llvm.amdgcn.s.dcache.inv.vol.ll
+++ test/CodeGen/AMDGPU/llvm.amdgcn.s.dcache.inv.vol.ll
@@ -20,7 +20,7 @@
 ; GCN: s_waitcnt lgkmcnt(0) ; encoding
 define amdgpu_kernel void @test_s_dcache_inv_vol_insert_wait() #0 {
   call void @llvm.amdgcn.s.dcache.inv.vol()
-  call void @llvm.amdgcn.s.waitcnt(i32 0)
+  call void @llvm.amdgcn.s.waitcnt(i32 127)
   br label %end
 
 end:
Index: test/CodeGen/AMDGPU/llvm.amdgcn.s.dcache.wb.ll
===================================================================
--- test/CodeGen/AMDGPU/llvm.amdgcn.s.dcache.wb.ll
+++ test/CodeGen/AMDGPU/llvm.amdgcn.s.dcache.wb.ll
@@ -18,7 +18,7 @@
 ; VI: s_waitcnt lgkmcnt(0) ; encoding
 define amdgpu_kernel void @test_s_dcache_wb_insert_wait() #0 {
   call void @llvm.amdgcn.s.dcache.wb()
-  call void @llvm.amdgcn.s.waitcnt(i32 0)
+  call void @llvm.amdgcn.s.waitcnt(i32 127)
   br label %end
 
 end:
Index: test/CodeGen/AMDGPU/llvm.amdgcn.s.dcache.wb.vol.ll
===================================================================
--- test/CodeGen/AMDGPU/llvm.amdgcn.s.dcache.wb.vol.ll
+++ test/CodeGen/AMDGPU/llvm.amdgcn.s.dcache.wb.vol.ll
@@ -18,7 +18,7 @@
 ; VI: s_waitcnt lgkmcnt(0) ; encoding
 define amdgpu_kernel void @test_s_dcache_wb_vol_insert_wait() #0 {
   call void @llvm.amdgcn.s.dcache.wb.vol()
-  call void @llvm.amdgcn.s.waitcnt(i32 0)
+  call void @llvm.amdgcn.s.waitcnt(i32 127)
   br label %end
 
 end:
Index: test/CodeGen/AMDGPU/llvm.amdgcn.s.waitcnt.ll
===================================================================
--- test/CodeGen/AMDGPU/llvm.amdgcn.s.waitcnt.ll
+++ test/CodeGen/AMDGPU/llvm.amdgcn.s.waitcnt.ll
@@ -18,8 +18,8 @@
 ;
 ; CHECK-LABEL: {{^}}test2:
 ; CHECK: image_load
-; CHECK-NOT: s_waitcnt vmcnt(0){{$}}
-; CHECK: s_waitcnt
+; CHECK-NEXT: s_waitcnt
+; CHECK: s_waitcnt vmcnt(0){{$}}
 ; CHECK-NEXT: image_store
 define amdgpu_ps void @test2(<8 x i32> inreg %rsrc, i32 %c) {
   %t = call <4 x float> @llvm.amdgcn.image.load.v4f32.i32.v8i32(i32 %c, <8 x i32> %rsrc, i32 15, i1 0, i1 0, i1 0, i1 0)
Index: test/CodeGen/AMDGPU/multi-divergent-exit-region.ll
===================================================================
--- test/CodeGen/AMDGPU/multi-divergent-exit-region.ll
+++ test/CodeGen/AMDGPU/multi-divergent-exit-region.ll
@@ -362,6 +362,7 @@
 
 ; GCN: {{^BB[0-9]+_[0-9]+}}: ; %UnifiedReturnBlock
 ; GCN-NEXT: s_or_b64 exec, exec
+; GCN-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN-NEXT: ; return
 
 define amdgpu_ps float @uniform_branch_to_multi_divergent_region_exit_ret_ret_return_value(i32 inreg %sgpr, i32 %vgpr) #0 {
Index: test/CodeGen/AMDGPU/ret_jump.ll
===================================================================
--- test/CodeGen/AMDGPU/ret_jump.ll
+++ test/CodeGen/AMDGPU/ret_jump.ll
@@ -65,7 +65,6 @@
 
 ; GCN-NEXT:  ; %unreachable.bb
 ; GCN: ds_write_b32
-; GCN: s_waitcnt
 ; GCN: ; divergent unreachable
 
 ; GCN: ; %ret.bb
@@ -73,6 +72,7 @@
 
 ; GCN: ; %UnifiedReturnBlock
 ; GCN-NEXT: s_or_b64 exec, exec
+; GCN-NEXT: s_waitcnt 
 ; GCN-NEXT: ; return
 ; GCN-NEXT: .Lfunc_end
 define amdgpu_ps <{ i32, i32, i32, i32, i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> @uniform_br_nontrivial_ret_divergent_br_nontrivial_unreachable([9 x <16 x i8>] addrspace(2)* byval %arg, [17 x <16 x i8>] addrspace(2)* byval %arg1, [17 x <8 x i32>] addrspace(2)* byval %arg2, i32 addrspace(2)* byval %arg3, float inreg %arg4, i32 inreg %arg5, <2 x i32> %arg6, <2 x i32> %arg7, <2 x i32> %arg8, <3 x i32> %arg9, <2 x i32> %arg10, <2 x i32> %arg11, <2 x i32> %arg12, float %arg13, float %arg14, float %arg15, float %arg16, float %arg17, i32 inreg %arg18, i32 %arg19, float %arg20, i32 %arg21) #0 {
Index: test/CodeGen/AMDGPU/si-lower-control-flow-unreachable-block.ll
===================================================================
--- test/CodeGen/AMDGPU/si-lower-control-flow-unreachable-block.ll
+++ test/CodeGen/AMDGPU/si-lower-control-flow-unreachable-block.ll
@@ -9,7 +9,6 @@
 ; GCN-NEXT: BB{{[0-9]+_[0-9]+}}: ; %unreachable
 ; GCN: ds_write_b32
 ; GCN: ; divergent unreachable
-; GCN: s_waitcnt
 
 ; GCN-NEXT: [[RET]]: ; %UnifiedReturnBlock
 ; GCN-NEXT: s_or_b64 exec, exec
@@ -38,7 +37,6 @@
 ; GCN-NEXT: {{^BB[0-9]+_[0-9]+}}: ; %unreachable
 ; GCN: ds_write_b32
 ; GCN: ; divergent unreachable
-; GCN: s_waitcnt
 
 ; GCN: [[RETURN]]:
 ; GCN-NEXT: s_or_b64 exec, exec
@@ -66,7 +64,6 @@
 
 ; GCN: [[UNREACHABLE]]:
 ; GCN: ds_write_b32
-; GCN: s_waitcnt
 define amdgpu_kernel void @uniform_lower_control_flow_unreachable_terminator(i32 %arg0) #0 {
 bb:
   %tmp63 = icmp eq i32 %arg0, 32
Index: test/CodeGen/AMDGPU/smrd-vccz-bug.ll
===================================================================
--- test/CodeGen/AMDGPU/smrd-vccz-bug.ll
+++ test/CodeGen/AMDGPU/smrd-vccz-bug.ll
@@ -5,7 +5,7 @@
 ; GCN-FUNC: {{^}}vccz_workaround:
 ; GCN: s_load_dword s{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], 0x0
 ; GCN: v_cmp_neq_f32_e64 vcc, s{{[0-9]+}}, 0{{$}}
-; GCN: s_waitcnt lgkmcnt(0)
+; VCCZ-BUG: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VCCZ-BUG: s_mov_b64 vcc, vcc
 ; NOVCCZ-BUG-NOT: s_mov_b64 vcc, vcc
 ; GCN: s_cbranch_vccnz [[EXIT:[0-9A-Za-z_]+]]
Index: test/CodeGen/AMDGPU/spill-m0.ll
===================================================================
--- test/CodeGen/AMDGPU/spill-m0.ll
+++ test/CodeGen/AMDGPU/spill-m0.ll
@@ -18,13 +18,11 @@
 ; TOVMEM-DAG: s_mov_b32 [[M0_COPY:s[0-9]+]], m0
 ; TOVMEM-DAG: v_mov_b32_e32 [[SPILL_VREG:v[0-9]+]], [[M0_COPY]]
 ; TOVMEM: buffer_store_dword [[SPILL_VREG]], off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offset:4 ; 4-byte Folded Spill
-; TOVMEM: s_waitcnt vmcnt(0)
 
 ; TOSMEM-DAG: s_mov_b32 [[M0_COPY:s[0-9]+]], m0
 ; TOSMEM: s_add_u32 m0, s3, 0x100{{$}}
 ; TOSMEM-NOT: [[M0_COPY]]
 ; TOSMEM: s_buffer_store_dword [[M0_COPY]], s{{\[}}[[LO]]:[[HI]]], m0 ; 4-byte Folded Spill
-; TOSMEM: s_waitcnt lgkmcnt(0)
 
 ; GCN: s_cbranch_scc1 [[ENDIF:BB[0-9]+_[0-9]+]]
 
Index: test/CodeGen/AMDGPU/valu-i1.ll
===================================================================
--- test/CodeGen/AMDGPU/valu-i1.ll
+++ test/CodeGen/AMDGPU/valu-i1.ll
@@ -11,7 +11,6 @@
 ; SI: v_cmp_lt_i32_e32 vcc, 0,
 ; SI-NEXT: s_and_saveexec_b64 [[SAVE1:s\[[0-9]+:[0-9]+\]]], vcc
 ; SI-NEXT: s_xor_b64 [[SAVE2:s\[[0-9]+:[0-9]+\]]], exec, [[SAVE1]]
-; SI-NEXT: s_waitcnt lgkmcnt(0)
 ; SI-NEXT: ; mask branch [[FLOW_BB:BB[0-9]+_[0-9]+]]
 ; SI-NEXT: s_cbranch_execz [[FLOW_BB]]
 
@@ -72,7 +71,6 @@
 
 ; SI-NEXT: BB{{[0-9]+_[0-9]+}}:
 ; SI: buffer_store_dword
-; SI-NEXT: s_waitcnt
 
 ; SI-NEXT: {{^}}[[EXIT]]:
 ; SI: s_or_b64 exec, exec, [[BR_SREG]]
@@ -101,7 +99,6 @@
 
 ; SI-NEXT: BB{{[0-9]+_[0-9]+}}:
 ; SI: buffer_store_dword
-; SI-NEXT: s_waitcnt
 
 ; SI-NEXT: {{^}}[[EXIT]]:
 ; SI: s_or_b64 exec, exec, [[BR_SREG]]
@@ -132,7 +129,6 @@
 
 ; SI-NEXT: {{^BB[0-9]+_[0-9]+}}: ; %exit
 ; SI: ds_write_b32
-; SI: s_waitcnt
 
 ; SI-NEXT: {{^}}[[FLOW]]:
 ; SI-NEXT: s_or_saveexec_b64
@@ -140,8 +136,8 @@
 ; SI-NEXT: ; mask branch [[UNIFIED_RETURN:BB[0-9]+_[0-9]+]]
 
 ; SI-NEXT: {{^BB[0-9]+_[0-9]+}}: ; %then
-; SI: buffer_store_dword
-; SI-NEXT: s_waitcnt
+; SI: s_waitcnt
+; SI-NEXT: buffer_store_dword
 
 ; SI-NEXT: {{^}}[[UNIFIED_RETURN]]: ; %UnifiedReturnBlock
 ; SI: s_or_b64 exec, exec