Index: llvm/trunk/lib/Target/AMDGPU/AMDGPU.h
===================================================================
--- llvm/trunk/lib/Target/AMDGPU/AMDGPU.h
+++ llvm/trunk/lib/Target/AMDGPU/AMDGPU.h
@@ -50,7 +50,6 @@
 FunctionPass *createSIFixSGPRCopiesPass();
 FunctionPass *createSIMemoryLegalizerPass();
 FunctionPass *createSIDebuggerInsertNopsPass();
-FunctionPass *createSIInsertWaitsPass();
 FunctionPass *createSIInsertWaitcntsPass();
 FunctionPass *createSIFixWWMLivenessPass();
 FunctionPass *createAMDGPUSimplifyLibCallsPass(const TargetOptions &);
@@ -169,9 +168,6 @@
 void initializeSIDebuggerInsertNopsPass(PassRegistry&);
 extern char &SIDebuggerInsertNopsID;
 
-void initializeSIInsertWaitsPass(PassRegistry&);
-extern char &SIInsertWaitsID;
-
 void initializeSIInsertWaitcntsPass(PassRegistry&);
 extern char &SIInsertWaitcntsID;
 
Index: llvm/trunk/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
===================================================================
--- llvm/trunk/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ llvm/trunk/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -168,7 +168,6 @@
   initializeAMDGPURewriteOutArgumentsPass(*PR);
   initializeAMDGPUUnifyMetadataPass(*PR);
   initializeSIAnnotateControlFlowPass(*PR);
-  initializeSIInsertWaitsPass(*PR);
   initializeSIInsertWaitcntsPass(*PR);
   initializeSIWholeQuadModePass(*PR);
   initializeSILowerControlFlowPass(*PR);
@@ -876,10 +875,7 @@
   addPass(&PostRAHazardRecognizerID);
 
   addPass(createSIMemoryLegalizerPass());
-  if (EnableSIInsertWaitcntsPass)
-    addPass(createSIInsertWaitcntsPass());
-  else
-    addPass(createSIInsertWaitsPass());
+  addPass(createSIInsertWaitcntsPass());
   addPass(createSIShrinkInstructionsPass());
   addPass(&SIInsertSkipsPassID);
   addPass(createSIDebuggerInsertNopsPass());
Index: llvm/trunk/lib/Target/AMDGPU/CMakeLists.txt
===================================================================
--- llvm/trunk/lib/Target/AMDGPU/CMakeLists.txt
+++ llvm/trunk/lib/Target/AMDGPU/CMakeLists.txt
@@ -82,7 +82,6 @@
   SIFrameLowering.cpp
   SIInsertSkips.cpp
   SIInsertWaitcnts.cpp
-  SIInsertWaits.cpp
   SIInstrInfo.cpp
   SIISelLowering.cpp
   SILoadStoreOptimizer.cpp
Index: llvm/trunk/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
===================================================================
--- llvm/trunk/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
+++ llvm/trunk/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
@@ -386,7 +386,9 @@
   std::vector<std::unique_ptr<BlockWaitcntBrackets>> KillWaitBrackets;
 
-  bool ForceEmitZeroWaitcnt;
+  // ForceEmitZeroWaitcnts: force all waitcnt instructions to be s_waitcnt 0
+  // because of the amdgpu-waitcnt-forcezero flag
+  bool ForceEmitZeroWaitcnts;
   bool ForceEmitWaitcnt[NUM_INST_CNTS];
 
 public:
@@ -881,14 +883,18 @@
   // To emit, or not to emit - that's the question!
   // Start with an assumption that there is no need to emit.
   unsigned int EmitWaitcnt = 0;
+
   // No need to wait before phi. If a phi-move exists, then the wait should
   // have been inserted before the move. If a phi-move does not exist, then
   // the wait should be inserted before the real use. The same is true for
   // sc-merge. It is not a coincidence that all these cases correspond to the
   // instructions that are skipped in the assembling loop.
   bool NeedLineMapping = false; // TODO: Check on this.
-  setForceEmitWaitcnt();
+  // ForceEmitZeroWaitcnt: force a single s_waitcnt 0 due to hw bug
+  bool ForceEmitZeroWaitcnt = false;
+
+  setForceEmitWaitcnt();
   bool IsForceEmitWaitcnt = isForceEmitWaitcnt();
 
   if (MI.isDebugValue() &&
@@ -1128,6 +1134,9 @@
       // block, so if we only wait on LGKM here, we might end up with
       // another s_waitcnt inserted right after this if there are non-LGKM
       // instructions still outstanding.
+      // FIXME: this is too conservative / the comment is wrong.
+      // We don't wait on everything at the end of the block and we combine
+      // waitcnts so we should never have back-to-back waitcnts.
       ForceEmitZeroWaitcnt = true;
       EmitWaitcnt = true;
     }
@@ -1138,7 +1147,7 @@
   int CntVal[NUM_INST_CNTS];
   bool UseDefaultWaitcntStrategy = true;
 
-  if (ForceEmitZeroWaitcnt) {
+  if (ForceEmitZeroWaitcnt || ForceEmitZeroWaitcnts) {
     // Force all waitcnts to 0.
     for (enum InstCounterType T = VM_CNT; T < NUM_INST_CNTS;
          T = (enum InstCounterType)(T + 1)) {
@@ -1232,7 +1241,7 @@
   }
   if (insertSWaitInst) {
     if (OldWaitcnt && OldWaitcnt->getOpcode() == AMDGPU::S_WAITCNT) {
-      if (ForceEmitZeroWaitcnt)
+      if (ForceEmitZeroWaitcnts)
        DEBUG(dbgs() << "Force emit s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)\n");
      if (IsForceEmitWaitcnt)
        DEBUG(dbgs() << "Force emit a s_waitcnt due to debug counter\n");
@@ -1828,7 +1837,7 @@
   const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
   AMDGPUASI = ST->getAMDGPUAS();
 
-  ForceEmitZeroWaitcnt = ForceEmitZeroFlag;
+  ForceEmitZeroWaitcnts = ForceEmitZeroFlag;
   for (enum InstCounterType T = VM_CNT; T < NUM_INST_CNTS;
        T = (enum InstCounterType)(T + 1))
     ForceEmitWaitcnt[T] = false;
Index: llvm/trunk/lib/Target/AMDGPU/SIInsertWaits.cpp
===================================================================
--- llvm/trunk/lib/Target/AMDGPU/SIInsertWaits.cpp
+++ llvm/trunk/lib/Target/AMDGPU/SIInsertWaits.cpp
@@ -1,703 +0,0 @@
-//===- SIInsertWaits.cpp - Insert Wait Instructions ----------------------===//
-//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-/// \file
-/// Insert wait instructions for memory reads and writes.
-///
-/// Memory reads and writes are issued asynchronously, so we need to insert
-/// S_WAITCNT instructions when we want to access any of their results or
-/// overwrite any register that's used asynchronously.
-//
-//===----------------------------------------------------------------------===//
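[Note for readers: the S_WAITCNT immediate that both the old and the new pass
emit packs all three hardware counters into a single word. A minimal sketch of
the encoding step, using the AMDGPUBaseInfo helpers that this deleted file
calls in insertWait() below; the free function and its name are illustrative,
not part of the patch:

// Sketch: clamp each pending count to what the hardware can encode, then
// pack vmcnt/expcnt/lgkmcnt into one S_WAITCNT immediate. A field left at
// its bit-mask maximum means "do not wait on that counter".
#include "Utils/AMDGPUBaseInfo.h"
#include <algorithm>

static unsigned buildWaitcntImm(const llvm::AMDGPU::IsaInfo::IsaVersion &ISA,
                                unsigned VM, unsigned EXP, unsigned LGKM) {
  using namespace llvm;
  VM = std::min(VM, AMDGPU::getVmcntBitMask(ISA));
  EXP = std::min(EXP, AMDGPU::getExpcntBitMask(ISA));
  LGKM = std::min(LGKM, AMDGPU::getLgkmcntBitMask(ISA));
  return AMDGPU::encodeWaitcnt(ISA, VM, EXP, LGKM);
}
]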
-
-#include "AMDGPU.h"
-#include "AMDGPUSubtarget.h"
-#include "SIDefines.h"
-#include "SIInstrInfo.h"
-#include "SIMachineFunctionInfo.h"
-#include "SIRegisterInfo.h"
-#include "Utils/AMDGPUBaseInfo.h"
-#include "llvm/ADT/SmallVector.h"
-#include "llvm/ADT/StringRef.h"
-#include "llvm/CodeGen/MachineBasicBlock.h"
-#include "llvm/CodeGen/MachineFunction.h"
-#include "llvm/CodeGen/MachineFunctionPass.h"
-#include "llvm/CodeGen/MachineInstr.h"
-#include "llvm/CodeGen/MachineInstrBuilder.h"
-#include "llvm/CodeGen/MachineOperand.h"
-#include "llvm/CodeGen/MachineRegisterInfo.h"
-#include "llvm/IR/DebugLoc.h"
-#include "llvm/MC/MCInstrDesc.h"
-#include "llvm/Pass.h"
-#include "llvm/Support/Debug.h"
-#include "llvm/Support/raw_ostream.h"
-#include <algorithm>
-#include <cassert>
-#include <cstdint>
-#include <cstring>
-#include <utility>
-
-#define DEBUG_TYPE "si-insert-waits"
-
-using namespace llvm;
-
-namespace {
-
-/// One variable for each of the hardware counters
-using Counters = union {
-  struct {
-    unsigned VM;
-    unsigned EXP;
-    unsigned LGKM;
-  } Named;
-  unsigned Array[3];
-};
-
-using InstType = enum {
-  OTHER,
-  SMEM,
-  VMEM
-};
-
-using RegCounters = Counters[512];
-using RegInterval = std::pair<unsigned, unsigned>;
-
-class SIInsertWaits : public MachineFunctionPass {
-private:
-  const SISubtarget *ST = nullptr;
-  const SIInstrInfo *TII = nullptr;
-  const SIRegisterInfo *TRI = nullptr;
-  const MachineRegisterInfo *MRI;
-  AMDGPU::IsaInfo::IsaVersion ISA;
-
-  /// Constant zero value
-  static const Counters ZeroCounts;
-
-  /// Hardware limits
-  Counters HardwareLimits;
-
-  /// Counter values we have already waited on.
-  Counters WaitedOn;
-
-  /// Counter values that we must wait on before the next counter
-  /// increase.
-  Counters DelayedWaitOn;
-
-  /// Counter values for last instruction issued.
-  Counters LastIssued;
-
-  /// Registers used by async instructions.
-  RegCounters UsedRegs;
-
-  /// Registers defined by async instructions.
-  RegCounters DefinedRegs;
-
-  /// Different export instruction types seen since last wait.
-  unsigned ExpInstrTypesSeen = 0;
-
-  /// Type of the last opcode.
-  InstType LastOpcodeType;
-
-  bool LastInstWritesM0;
-
-  /// Whether or not we have flat operations outstanding.
-  bool IsFlatOutstanding;
-
-  /// Whether the machine function returns void
-  bool ReturnsVoid;
-
-  /// Whether the VCCZ bit is possibly corrupt
-  bool VCCZCorrupt = false;
-
-  /// Get increment/decrement amount for this instruction.
-  Counters getHwCounts(MachineInstr &MI);
-
-  /// Is operand relevant for async execution?
-  bool isOpRelevant(MachineOperand &Op);
-
-  /// Get register interval an operand affects.
-  RegInterval getRegInterval(const TargetRegisterClass *RC,
-                             const MachineOperand &Reg) const;
-
-  /// Handle an instruction's async components.
-  void pushInstruction(MachineBasicBlock &MBB,
-                       MachineBasicBlock::iterator I,
-                       const Counters &Increment);
-
-  /// Insert the actual wait instruction
-  bool insertWait(MachineBasicBlock &MBB,
-                  MachineBasicBlock::iterator I,
-                  const Counters &Counts);
-
-  /// Handle existing wait instructions (from intrinsics)
-  void handleExistingWait(MachineBasicBlock::iterator I);
-
-  /// Do we need def2def checks?
-  bool unorderedDefines(MachineInstr &MI);
-
-  /// Resolve all operand dependencies to counter requirements
-  Counters handleOperands(MachineInstr &MI);
-
-  /// Insert S_NOP between an instruction writing M0 and S_SENDMSG.
-  void handleSendMsg(MachineBasicBlock &MBB, MachineBasicBlock::iterator I);
-
-  /// Return true if there are LGKM instructions that haven't been waited on
-  /// yet.
-  bool hasOutstandingLGKM() const;
-
-public:
-  static char ID;
-
-  SIInsertWaits() : MachineFunctionPass(ID) {}
-
-  bool runOnMachineFunction(MachineFunction &MF) override;
-
-  StringRef getPassName() const override {
-    return "SI insert wait instructions";
-  }
-
-  void getAnalysisUsage(AnalysisUsage &AU) const override {
-    AU.setPreservesCFG();
-    MachineFunctionPass::getAnalysisUsage(AU);
-  }
-};
-
-} // end anonymous namespace
-
-INITIALIZE_PASS_BEGIN(SIInsertWaits, DEBUG_TYPE,
-                      "SI Insert Waits", false, false)
-INITIALIZE_PASS_END(SIInsertWaits, DEBUG_TYPE,
-                    "SI Insert Waits", false, false)
-
-char SIInsertWaits::ID = 0;
-
-char &llvm::SIInsertWaitsID = SIInsertWaits::ID;
-
-FunctionPass *llvm::createSIInsertWaitsPass() {
-  return new SIInsertWaits();
-}
-
-const Counters SIInsertWaits::ZeroCounts = { { 0, 0, 0 } };
-
-static bool readsVCCZ(const MachineInstr &MI) {
-  unsigned Opc = MI.getOpcode();
-  return (Opc == AMDGPU::S_CBRANCH_VCCNZ || Opc == AMDGPU::S_CBRANCH_VCCZ) &&
-         !MI.getOperand(1).isUndef();
-}
-
-bool SIInsertWaits::hasOutstandingLGKM() const {
-  return WaitedOn.Named.LGKM != LastIssued.Named.LGKM;
-}
-
-Counters SIInsertWaits::getHwCounts(MachineInstr &MI) {
-  uint64_t TSFlags = MI.getDesc().TSFlags;
-  Counters Result = { { 0, 0, 0 } };
-
-  Result.Named.VM = !!(TSFlags & SIInstrFlags::VM_CNT);
-
-  // Only consider stores or EXP for EXP_CNT
-  Result.Named.EXP = !!(TSFlags & SIInstrFlags::EXP_CNT) && MI.mayStore();
-
-  // LGKM may use larger values
-  if (TSFlags & SIInstrFlags::LGKM_CNT) {
-
-    if (TII->isSMRD(MI)) {
-
-      if (MI.getNumOperands() != 0) {
-        assert(MI.getOperand(0).isReg() &&
-               "First LGKM operand must be a register!");
-
-        // XXX - What if this is a write into a super register?
-        const TargetRegisterClass *RC = TII->getOpRegClass(MI, 0);
-        unsigned Size = TRI->getRegSizeInBits(*RC);
-        Result.Named.LGKM = Size > 32 ? 2 : 1;
-      } else {
-        // s_dcache_inv etc. do not have a destination register. Assume we
-        // want a wait on these.
-        // XXX - What is the right value?
-        Result.Named.LGKM = 1;
-      }
-    } else {
-      // DS
-      Result.Named.LGKM = 1;
-    }
-
-  } else {
-    Result.Named.LGKM = 0;
-  }
-
-  return Result;
-}
-
-bool SIInsertWaits::isOpRelevant(MachineOperand &Op) {
-  // Constants are always irrelevant
-  if (!Op.isReg() || !TRI->isInAllocatableClass(Op.getReg()))
-    return false;
-
-  // Defines are always relevant
-  if (Op.isDef())
-    return true;
-
-  // For exports all registers are relevant.
-  // TODO: Skip undef/disabled registers.
-  MachineInstr &MI = *Op.getParent();
-  if (TII->isEXP(MI))
-    return true;
-
-  // For stores the stored value is also relevant
-  if (!MI.getDesc().mayStore())
-    return false;
-
-  // Check if this operand is the value being stored.
-  // Special case for DS/FLAT instructions, since the address
-  // operand comes before the value operand and it may have
-  // multiple data operands.
-
-  if (TII->isDS(MI)) {
-    MachineOperand *Data0 = TII->getNamedOperand(MI, AMDGPU::OpName::data0);
-    if (Data0 && Op.isIdenticalTo(*Data0))
-      return true;
-
-    MachineOperand *Data1 = TII->getNamedOperand(MI, AMDGPU::OpName::data1);
-    return Data1 && Op.isIdenticalTo(*Data1);
-  }
-
-  if (TII->isFLAT(MI)) {
-    MachineOperand *Data = TII->getNamedOperand(MI, AMDGPU::OpName::vdata);
-    if (Data && Op.isIdenticalTo(*Data))
-      return true;
-  }
-
-  // NOTE: This assumes that the value operand is before the
-  // address operand, and that there is only one value operand.
-  for (MachineInstr::mop_iterator I = MI.operands_begin(),
-       E = MI.operands_end(); I != E; ++I) {
-
-    if (I->isReg() && I->isUse())
-      return Op.isIdenticalTo(*I);
-  }
-
-  return false;
-}
-
-RegInterval SIInsertWaits::getRegInterval(const TargetRegisterClass *RC,
-                                          const MachineOperand &Reg) const {
-  unsigned Size = TRI->getRegSizeInBits(*RC);
-  assert(Size >= 32);
-
-  RegInterval Result;
-  Result.first = TRI->getEncodingValue(Reg.getReg());
-  Result.second = Result.first + Size / 32;
-
-  return Result;
-}
-
-void SIInsertWaits::pushInstruction(MachineBasicBlock &MBB,
-                                    MachineBasicBlock::iterator I,
-                                    const Counters &Increment) {
-  // Get the hardware counter increments and sum them up
-  Counters Limit = ZeroCounts;
-  unsigned Sum = 0;
-
-  if (TII->mayAccessFlatAddressSpace(*I))
-    IsFlatOutstanding = true;
-
-  for (unsigned i = 0; i < 3; ++i) {
-    LastIssued.Array[i] += Increment.Array[i];
-    if (Increment.Array[i])
-      Limit.Array[i] = LastIssued.Array[i];
-    Sum += Increment.Array[i];
-  }
-
-  // If we don't increase anything then that's it
-  if (Sum == 0) {
-    LastOpcodeType = OTHER;
-    return;
-  }
-
-  if (ST->getGeneration() >= SISubtarget::VOLCANIC_ISLANDS) {
-    // Any occurrence of consecutive VMEM or SMEM instructions forms a VMEM
-    // or SMEM clause, respectively.
-    //
-    // The temporary workaround is to break the clauses with S_NOP.
-    //
-    // The proper solution would be to allocate registers such that all source
-    // and destination registers don't overlap, e.g. this is illegal:
-    //   r0 = load r2
-    //   r2 = load r0
-    if (LastOpcodeType == VMEM && Increment.Named.VM) {
-      // Insert a NOP to break the clause.
-      BuildMI(MBB, I, DebugLoc(), TII->get(AMDGPU::S_NOP))
-          .addImm(0);
-      LastInstWritesM0 = false;
-    }
-
-    if (TII->isSMRD(*I))
-      LastOpcodeType = SMEM;
-    else if (Increment.Named.VM)
-      LastOpcodeType = VMEM;
-  }
-
-  // Remember which export instructions we have seen
-  if (Increment.Named.EXP) {
-    ExpInstrTypesSeen |= TII->isEXP(*I) ? 1 : 2;
-  }
-
-  for (unsigned i = 0, e = I->getNumOperands(); i != e; ++i) {
-    MachineOperand &Op = I->getOperand(i);
-    if (!isOpRelevant(Op))
-      continue;
-
-    const TargetRegisterClass *RC = TII->getOpRegClass(*I, i);
-    RegInterval Interval = getRegInterval(RC, Op);
-    for (unsigned j = Interval.first; j < Interval.second; ++j) {
-
-      // Remember which registers we define
-      if (Op.isDef())
-        DefinedRegs[j] = Limit;
-
-      // and which one we are using
-      if (Op.isUse())
-        UsedRegs[j] = Limit;
-    }
-  }
-}
-
-bool SIInsertWaits::insertWait(MachineBasicBlock &MBB,
-                               MachineBasicBlock::iterator I,
-                               const Counters &Required) {
-  // End of program? No need to wait on anything
-  // A function not returning void needs to wait, because other bytecode will
-  // be appended after it and we don't know what it will be.
-  if (I != MBB.end() && I->getOpcode() == AMDGPU::S_ENDPGM && ReturnsVoid)
-    return false;
-
-  // Figure out if the async instructions execute in order
-  bool Ordered[3];
-
-  // VM_CNT is always ordered except when there are flat instructions, which
-  // can return out of order.
-  Ordered[0] = !IsFlatOutstanding;
-
-  // EXP_CNT is unordered if we have both EXP & VM-writes
-  Ordered[1] = ExpInstrTypesSeen == 3;
-
-  // LGKM_CNT is handled as always unordered. TODO: Handle LDS and GDS
-  Ordered[2] = false;
-
-  // The values we are going to put into the S_WAITCNT instruction
-  Counters Counts = HardwareLimits;
-
-  // Do we really need to wait?
-  bool NeedWait = false;
-
-  for (unsigned i = 0; i < 3; ++i) {
-    if (Required.Array[i] <= WaitedOn.Array[i])
-      continue;
-
-    NeedWait = true;
-
-    if (Ordered[i]) {
-      unsigned Value = LastIssued.Array[i] - Required.Array[i];
-
-      // Adjust the value to the real hardware possibilities.
-      Counts.Array[i] = std::min(Value, HardwareLimits.Array[i]);
-    } else
-      Counts.Array[i] = 0;
-
-    // Remember what we have waited on.
-    WaitedOn.Array[i] = LastIssued.Array[i] - Counts.Array[i];
-  }
-
-  if (!NeedWait)
-    return false;
-
-  // Reset EXP_CNT instruction types
-  if (Counts.Named.EXP == 0)
-    ExpInstrTypesSeen = 0;
-
-  // Build the wait instruction
-  BuildMI(MBB, I, DebugLoc(), TII->get(AMDGPU::S_WAITCNT))
-      .addImm(AMDGPU::encodeWaitcnt(ISA,
-                                    Counts.Named.VM,
-                                    Counts.Named.EXP,
-                                    Counts.Named.LGKM));
-
-  LastOpcodeType = OTHER;
-  LastInstWritesM0 = false;
-  IsFlatOutstanding = false;
-  return true;
-}
-
-/// Helper function for handleOperands
-static void increaseCounters(Counters &Dst, const Counters &Src) {
-  for (unsigned i = 0; i < 3; ++i)
-    Dst.Array[i] = std::max(Dst.Array[i], Src.Array[i]);
-}
-
-/// Check whether any of the counters is non-zero
-static bool countersNonZero(const Counters &Counter) {
-  for (unsigned i = 0; i < 3; ++i)
-    if (Counter.Array[i])
-      return true;
-  return false;
-}
-
-void SIInsertWaits::handleExistingWait(MachineBasicBlock::iterator I) {
-  assert(I->getOpcode() == AMDGPU::S_WAITCNT);
-
-  unsigned Imm = I->getOperand(0).getImm();
-  Counters Counts, WaitOn;
-
-  Counts.Named.VM = AMDGPU::decodeVmcnt(ISA, Imm);
-  Counts.Named.EXP = AMDGPU::decodeExpcnt(ISA, Imm);
-  Counts.Named.LGKM = AMDGPU::decodeLgkmcnt(ISA, Imm);
-
-  for (unsigned i = 0; i < 3; ++i) {
-    if (Counts.Array[i] <= LastIssued.Array[i])
-      WaitOn.Array[i] = LastIssued.Array[i] - Counts.Array[i];
-    else
-      WaitOn.Array[i] = 0;
-  }
-
-  increaseCounters(DelayedWaitOn, WaitOn);
-}
-
-Counters SIInsertWaits::handleOperands(MachineInstr &MI) {
-  Counters Result = ZeroCounts;
-
-  // For each register affected by this instruction increase the result
-  // sequence.
-  //
-  // TODO: We could probably just look at explicit operands if we removed VCC /
-  // EXEC from SMRD dest reg classes.
-  for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
-    MachineOperand &Op = MI.getOperand(i);
-    if (!Op.isReg() || !TRI->isInAllocatableClass(Op.getReg()))
-      continue;
-
-    const TargetRegisterClass *RC = TII->getOpRegClass(MI, i);
-    RegInterval Interval = getRegInterval(RC, Op);
-    for (unsigned j = Interval.first; j < Interval.second; ++j) {
-      if (Op.isDef()) {
-        increaseCounters(Result, UsedRegs[j]);
-        increaseCounters(Result, DefinedRegs[j]);
-      }
-
-      if (Op.isUse())
-        increaseCounters(Result, DefinedRegs[j]);
-    }
-  }
-
-  return Result;
-}
-
-void SIInsertWaits::handleSendMsg(MachineBasicBlock &MBB,
-                                  MachineBasicBlock::iterator I) {
-  if (ST->getGeneration() < SISubtarget::VOLCANIC_ISLANDS)
-    return;
-
-  // There must be "S_NOP 0" between an instruction writing M0 and S_SENDMSG.
-  if (LastInstWritesM0 && (I->getOpcode() == AMDGPU::S_SENDMSG ||
-                           I->getOpcode() == AMDGPU::S_SENDMSGHALT)) {
-    BuildMI(MBB, I, DebugLoc(), TII->get(AMDGPU::S_NOP)).addImm(0);
-    LastInstWritesM0 = false;
-    return;
-  }
-
-  // Set whether this instruction sets M0
-  LastInstWritesM0 = false;
-
-  unsigned NumOperands = I->getNumOperands();
-  for (unsigned i = 0; i < NumOperands; i++) {
-    const MachineOperand &Op = I->getOperand(i);
-
-    if (Op.isReg() && Op.isDef() && Op.getReg() == AMDGPU::M0)
-      LastInstWritesM0 = true;
-  }
-}
-
-/// Return true if \p MBB has one successor immediately following, and is its
-/// only predecessor
-static bool hasTrivialSuccessor(const MachineBasicBlock &MBB) {
-  if (MBB.succ_size() != 1)
-    return false;
-
-  const MachineBasicBlock *Succ = *MBB.succ_begin();
-  return (Succ->pred_size() == 1) && MBB.isLayoutSuccessor(Succ);
-}
-
-// FIXME: Insert waits listed in Table 4.2 "Required User-Inserted Wait States"
-// around other non-memory instructions.
-bool SIInsertWaits::runOnMachineFunction(MachineFunction &MF) {
-  bool Changes = false;
-
-  ST = &MF.getSubtarget<SISubtarget>();
-  TII = ST->getInstrInfo();
-  TRI = &TII->getRegisterInfo();
-  MRI = &MF.getRegInfo();
-  ISA = AMDGPU::IsaInfo::getIsaVersion(ST->getFeatureBits());
-  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
-
-  HardwareLimits.Named.VM = AMDGPU::getVmcntBitMask(ISA);
-  HardwareLimits.Named.EXP = AMDGPU::getExpcntBitMask(ISA);
-  HardwareLimits.Named.LGKM = AMDGPU::getLgkmcntBitMask(ISA);
-
-  WaitedOn = ZeroCounts;
-  DelayedWaitOn = ZeroCounts;
-  LastIssued = ZeroCounts;
-  LastOpcodeType = OTHER;
-  LastInstWritesM0 = false;
-  IsFlatOutstanding = false;
-  ReturnsVoid = MFI->returnsVoid();
-
-  memset(&UsedRegs, 0, sizeof(UsedRegs));
-  memset(&DefinedRegs, 0, sizeof(DefinedRegs));
-
-  SmallVector<MachineInstr *, 4> RemoveMI;
-  SmallVector<MachineBasicBlock *, 4> EndPgmBlocks;
-
-  bool HaveScalarStores = false;
-
-  for (MachineFunction::iterator BI = MF.begin(), BE = MF.end();
-       BI != BE; ++BI) {
-    MachineBasicBlock &MBB = *BI;
-
-    for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end();
-         I != E; ++I) {
-      if (!HaveScalarStores && TII->isScalarStore(*I))
-        HaveScalarStores = true;
-
-      if (ST->getGeneration() <= SISubtarget::SEA_ISLANDS) {
-        // There is a hardware bug on CI/SI where an SMRD instruction may
-        // corrupt the vccz bit, so when we detect that an instruction may
-        // read from a corrupt vccz bit, we need to:
-        // 1. Insert s_waitcnt lgkmcnt(0) to wait for all outstanding SMRD
-        //    operations to complete.
-        // 2. Restore the correct value of vccz by writing the current value
-        //    of vcc back to vcc.
-
-        if (TII->isSMRD(I->getOpcode())) {
-          VCCZCorrupt = true;
-        } else if (!hasOutstandingLGKM() &&
-                   I->modifiesRegister(AMDGPU::VCC, TRI)) {
-          // FIXME: We only care about SMRD instructions here, not LDS or GDS.
-          // Whenever we store a value in vcc, the correct value of vccz is
-          // restored.
-          VCCZCorrupt = false;
-        }
-
-        // Check if we need to apply the bug work-around
-        if (VCCZCorrupt && readsVCCZ(*I)) {
-          DEBUG(dbgs() << "Inserting vccz bug work-around before: "
-                       << *I << '\n');
-
-          // Wait on everything, not just LGKM. vccz reads usually come from
-          // terminators, and we always wait on everything at the end of the
-          // block, so if we only wait on LGKM here, we might end up with
-          // another s_waitcnt inserted right after this if there are non-LGKM
-          // instructions still outstanding.
-          insertWait(MBB, I, LastIssued);
-
-          // Restore the vccz bit. Any time a value is written to vcc, the vcc
-          // bit is updated, so we can restore the bit by reading the value of
-          // vcc and then writing it back to the register.
-          BuildMI(MBB, I, I->getDebugLoc(), TII->get(AMDGPU::S_MOV_B64),
-                  AMDGPU::VCC)
-              .addReg(AMDGPU::VCC);
-        }
-      }
-
-      // Record pre-existing, explicitly requested waits
-      if (I->getOpcode() == AMDGPU::S_WAITCNT) {
-        handleExistingWait(*I);
-        RemoveMI.push_back(&*I);
-        continue;
-      }
-
-      Counters Required;
-
-      // Wait for everything before a barrier.
-      //
-      // S_SENDMSG implicitly waits for all outstanding LGKM transfers to
-      // finish, but we also want to wait for any other outstanding transfers
-      // before signalling other hardware blocks
-      if ((I->getOpcode() == AMDGPU::S_BARRIER &&
-           !ST->hasAutoWaitcntBeforeBarrier()) ||
-          I->getOpcode() == AMDGPU::S_SENDMSG ||
-          I->getOpcode() == AMDGPU::S_SENDMSGHALT)
-        Required = LastIssued;
-      else
-        Required = handleOperands(*I);
-
-      Counters Increment = getHwCounts(*I);
-
-      if (countersNonZero(Required) || countersNonZero(Increment))
-        increaseCounters(Required, DelayedWaitOn);
-
-      Changes |= insertWait(MBB, I, Required);
-
-      pushInstruction(MBB, I, Increment);
-      handleSendMsg(MBB, I);
-
-      if (I->getOpcode() == AMDGPU::S_ENDPGM ||
-          I->getOpcode() == AMDGPU::SI_RETURN_TO_EPILOG)
-        EndPgmBlocks.push_back(&MBB);
-    }
-
-    // Wait for everything at the end of the MBB. If there is only one
-    // successor, we can defer this until the uses there.
-    if (!hasTrivialSuccessor(MBB))
-      Changes |= insertWait(MBB, MBB.getFirstTerminator(), LastIssued);
-  }
-
-  if (HaveScalarStores) {
-    // If scalar writes are used, the cache must be flushed or else the next
-    // wave to reuse the same scratch memory can be clobbered.
-    //
-    // Insert s_dcache_wb at wave termination points if there were any scalar
-    // stores, and only if the cache hasn't already been flushed. This could be
-    // improved by looking across blocks for flushes in postdominating blocks
-    // from the stores but an explicitly requested flush is probably very rare.
-    for (MachineBasicBlock *MBB : EndPgmBlocks) {
-      bool SeenDCacheWB = false;
-
-      for (MachineBasicBlock::iterator I = MBB->begin(), E = MBB->end();
-           I != E; ++I) {
-        if (I->getOpcode() == AMDGPU::S_DCACHE_WB)
-          SeenDCacheWB = true;
-        else if (TII->isScalarStore(*I))
-          SeenDCacheWB = false;
-
-        // FIXME: It would be better to insert this before a waitcnt if any.
-        if ((I->getOpcode() == AMDGPU::S_ENDPGM ||
-             I->getOpcode() == AMDGPU::SI_RETURN_TO_EPILOG) && !SeenDCacheWB) {
-          Changes = true;
-          BuildMI(*MBB, I, I->getDebugLoc(), TII->get(AMDGPU::S_DCACHE_WB));
-        }
-      }
-    }
-  }
-
-  for (MachineInstr *I : RemoveMI)
-    I->eraseFromParent();
-
-  if (!MFI->isEntryFunction()) {
-    // Wait for any outstanding memory operations that the input registers may
-    // depend on. We can't track them and it's better to do the wait after the
-    // costly call sequence.
-
-    // TODO: Could insert earlier and schedule more liberally with operations
-    // that only use caller preserved registers.
-    MachineBasicBlock &EntryBB = MF.front();
-    BuildMI(EntryBB, EntryBB.getFirstNonPHI(), DebugLoc(),
-            TII->get(AMDGPU::S_WAITCNT))
-        .addImm(0);
-
-    Changes = true;
-  }
-
-  return Changes;
-}
Index: llvm/trunk/test/CodeGen/AMDGPU/vccz-corrupt-bug-workaround.mir
===================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/vccz-corrupt-bug-workaround.mir
+++ llvm/trunk/test/CodeGen/AMDGPU/vccz-corrupt-bug-workaround.mir
@@ -1,4 +1,4 @@
-# RUN: llc -run-pass si-insert-waits -march=amdgcn -mcpu=tahiti -o - %s | FileCheck %s
+# RUN: llc -run-pass si-insert-waitcnts -march=amdgcn -mcpu=tahiti -o - %s | FileCheck %s
 --- |
   define amdgpu_kernel void @vccz_corrupt_workaround(float %cond, i32 addrspace(1)* %out) #0 {
@@ -47,6 +47,7 @@
 ---
 # CHECK-LABEL: name: vccz_corrupt_workaround
 # CHECK: $vcc = V_CMP_EQ_F32
+# CHECK-NEXT: S_WAITCNT 0
 # CHECK-NEXT: $vcc = S_MOV_B64 $vcc
 # CHECK-NEXT: S_CBRANCH_VCCZ %bb.2, implicit killed $vcc
@@ -75,13 +76,12 @@
 hasMustTailInVarArgFunc: false
 body:             |
   bb.0.entry:
-    liveins: $sgpr0_sgpr1
+    liveins: $sgpr0_sgpr1, $vcc
 
-    $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 9, 0 :: (non-temporal dereferenceable invariant load 4 from `float addrspace(2)* undef`)
     $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed $sgpr0_sgpr1, 11, 0 :: (non-temporal dereferenceable invariant load 8 from `i64 addrspace(2)* undef`)
     $sgpr7 = S_MOV_B32 61440
     $sgpr6 = S_MOV_B32 -1
-    $vcc = V_CMP_EQ_F32_e64 0, 0, 0, $sgpr2, 0, implicit $exec
+    $vcc = V_CMP_EQ_F32_e64 0, 0, 0, undef $sgpr2, 0, implicit $exec
     S_CBRANCH_VCCZ %bb.1, implicit killed $vcc
 
   bb.2.if:
@@ -110,8 +110,8 @@
 ...
 ---
 # CHECK-LABEL: name: vccz_corrupt_undef_vcc
-# CHECK: S_WAITCNT
-# CHECK-NEXT: S_CBRANCH_VCCZ %bb.2, implicit undef $vcc
+# CHECK: S_WAITCNT 3855
+# CHECK-NEXT: $vgpr0 = V_MOV_B32_e32
 
 name: vccz_corrupt_undef_vcc
 alignment: 0
Index: llvm/trunk/test/CodeGen/AMDGPU/waitcnt.mir
===================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/waitcnt.mir
+++ llvm/trunk/test/CodeGen/AMDGPU/waitcnt.mir
@@ -1,4 +1,4 @@
-# RUN: llc -march=amdgcn -mcpu=fiji -run-pass si-insert-waits %s -o - | FileCheck %s
+# RUN: llc -march=amdgcn -mcpu=fiji -run-pass si-insert-waitcnts %s -o - | FileCheck %s
 --- |
   define amdgpu_kernel void @flat_zero_waitcnt(i32 addrspace(1)* %global4,
@@ -25,26 +25,26 @@
 # CHECK: FLAT_LOAD_DWORD
 # CHECK: FLAT_LOAD_DWORDX4
 # Global loads will return in order so we should:
-# s_waitcnt vmcnt(1) lgkmcnt(0)
-# CHECK-NEXT: S_WAITCNT 113
+# s_waitcnt vmcnt(1) lgkmcnt(1)
+# CHECK-NEXT: S_WAITCNT 369
 
 # CHECK-LABEL: bb.1:
 # CHECK: FLAT_LOAD_DWORD
-# CHECK: S_WAITCNT 3952
+# CHECK: S_WAITCNT 368
 # CHECK: FLAT_LOAD_DWORDX4
 # The first load has no mem operand, so we should assume it accesses the flat
 # address space.
-# s_waitcnt vmcnt(0) lgkmcnt(0)
-# CHECK-NEXT: S_WAITCNT 127
+# s_waitcnt lgkmcnt(1)
+# CHECK-NEXT: S_WAITCNT 383
 
 # CHECK-LABEL: bb.2:
 # CHECK: FLAT_LOAD_DWORD
-# CHECK: S_WAITCNT 3952
+# CHECK: S_WAITCNT 368
 # CHECK: FLAT_LOAD_DWORDX4
 
-# One outstand loads access the flat address space.
-# s_waitcnt vmcnt(0) lgkmcnt(0)
-# CHECK-NEXT: S_WAITCNT 127
+# One outstanding load accesses the flat address space.
+# s_waitcnt lgkmcnt(1)
+# CHECK-NEXT: S_WAITCNT 383
 
 name: flat_zero_waitcnt
 
@@ -99,8 +99,6 @@
 
 # CHECK-LABEL: name: single_branch_successor_not_next_block
-# CHECK: $vgpr0 = FLAT_LOAD_DWORD $vgpr1_vgpr2
-# CHECK-NEXT: S_WAITCNT 112
 
 # CHECK: bb.1
 # CHECK-NEXT: FLAT_STORE_DWORD
@@ -108,6 +106,7 @@
 
 # CHECK: bb.2:
 # CHECK-NEXT: V_LSHLREV_B64
+# CHECK-NEXT: S_WAITCNT 112
 # CHECK-NEXT: FLAT_STORE_DWORD
 
 name: single_branch_successor_not_next_block
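[Note for readers: the updated immediates in the CHECK lines above decode
per the SI/VI S_WAITCNT layout, with vmcnt in bits 3:0, expcnt in bits 6:4,
and lgkmcnt in bits 11:8; a field at its maximum value means "no wait on that
counter". This standalone snippet is illustrative only and is not part of
the patch or the test suite:

// Decode the S_WAITCNT immediates that appear in these tests.
#include <cstdio>

int main() {
  const unsigned Imms[] = {369, 383, 368, 112, 3855};
  for (unsigned Imm : Imms) {
    unsigned Vm = Imm & 0xF;         // vmcnt, bits 3:0
    unsigned Exp = (Imm >> 4) & 0x7; // expcnt, bits 6:4
    unsigned Lgkm = (Imm >> 8) & 0xF; // lgkmcnt, bits 11:8
    std::printf("S_WAITCNT %u = vmcnt(%u) expcnt(%u) lgkmcnt(%u)\n",
                Imm, Vm, Exp, Lgkm);
  }
}

For example, 369 decodes to vmcnt(1) expcnt(7) lgkmcnt(1), i.e. wait until at
most one VM and one LGKM operation remain outstanding, matching the
"s_waitcnt vmcnt(1) lgkmcnt(1)" comment, while 3855 decodes to expcnt(0) only.]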