Index: lib/Target/AMDGPU/GCNHazardRecognizer.h
===================================================================
--- lib/Target/AMDGPU/GCNHazardRecognizer.h
+++ lib/Target/AMDGPU/GCNHazardRecognizer.h
@@ -40,6 +40,7 @@
                             std::function<bool(MachineInstr*)> IsHazardDef =
                             [](MachineInstr*) {return true;});
 
+  int checkSMEMSoftClauseHazards(MachineInstr *SMEM);
   int checkSMRDHazards(MachineInstr *SMRD);
   int checkVMEMHazards(MachineInstr* VMEM);
   int checkDPPHazards(MachineInstr *DPP);
Index: lib/Target/AMDGPU/GCNHazardRecognizer.cpp
===================================================================
--- lib/Target/AMDGPU/GCNHazardRecognizer.cpp
+++ lib/Target/AMDGPU/GCNHazardRecognizer.cpp
@@ -137,17 +137,83 @@
 // No-op Hazard Detection
 //===----------------------------------------------------------------------===//
 
+static void addRegsToSet(iterator_range<MachineInstr::const_mop_iterator> Ops,
+                         std::set<unsigned> &Set) {
+  for (const MachineOperand &Op : Ops) {
+    if (Op.isReg())
+      Set.insert(Op.getReg());
+  }
+}
+
+int GCNHazardRecognizer::checkSMEMSoftClauseHazards(MachineInstr *SMEM) {
+  const AMDGPUSubtarget &ST = MF.getSubtarget<AMDGPUSubtarget>();
+
+  // SMEM soft clause are only present on VI+
+  if (ST.getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS)
+    return 0;
+
+  // FIXME: When we support stores, we need to make sure not to put loads and
+  // stores in the same clause if they use the same address.
+  assert(!SMEM->mayStore() && "hazard recognizer does not support SMEM stores");
+
+  // A soft-clause is any group of consecutive SMEM instructions.  The
+  // instructions in this group may return out of order and/or may be
+  // replayed (i.e. the same instruction issued more than once).
+  //
+  // In order to handle these situations correctly we need to make sure
+  // that when a clause has more than one instruction, no instruction in the
+  // clause writes to a register that is read another instruction in the clause
+  // (including itself). If we encounter this situaion, we need to break the
+  // clause by inserting a non SMEM instruction.
+
+  const SIInstrInfo *TII = static_cast<const SIInstrInfo*>(ST.getInstrInfo());
+  std::set<unsigned> ClauseDefs;
+  std::set<unsigned> ClauseUses;
+
+  for (MachineInstr *MI : EmittedInstrs) {
+
+    // When we hit a non-SMEM instruction then we have passed the start of the
+    // clause and we can stop.
+    if (!MI || !TII->isSMRD(*MI))
+      break;
+
+    addRegsToSet(MI->defs(), ClauseDefs);
+    addRegsToSet(MI->uses(), ClauseUses);
+  }
+
+  if (ClauseDefs.empty())
+    return 0;
+
+  addRegsToSet(SMEM->defs(), ClauseDefs);
+  addRegsToSet(SMEM->uses(), ClauseUses);
+
+  std::vector<unsigned> Result(std::max(ClauseDefs.size(), ClauseUses.size()));
+  std::vector<unsigned>::iterator End;
+
+  End = std::set_intersection(ClauseDefs.begin(), ClauseDefs.end(),
+                              ClauseUses.begin(), ClauseUses.end(), Result.begin());
+
+  // If the set of defs and uses intersect then we cannot add this instruction
+  // to the clause, so we have a hazard.
+  if (End != Result.begin())
+    return 1;
+
+  return 0;
+}
+
 int GCNHazardRecognizer::checkSMRDHazards(MachineInstr *SMRD) {
   const AMDGPUSubtarget &ST = MF.getSubtarget<AMDGPUSubtarget>();
   const SIInstrInfo *TII = static_cast<const SIInstrInfo*>(ST.getInstrInfo());
+  int WaitStatesNeeded = 0;
+
+  WaitStatesNeeded = checkSMEMSoftClauseHazards(SMRD);
 
   if (ST.getGeneration() != AMDGPUSubtarget::SOUTHERN_ISLANDS)
-    return 0;
+    return WaitStatesNeeded;
 
   // A read of an SGPR by SMRD instruction requires 4 wait states when the
   // SGPR was written by a VALU instruction.
   int SmrdSgprWaitStates = 4;
-  int WaitStatesNeeded = 0;
   auto IsHazardDefFn = [TII] (MachineInstr *MI) { return TII->isVALU(*MI); };
 
   for (const MachineOperand &Use : SMRD->uses()) {
Index: lib/Target/AMDGPU/SIInsertWaits.cpp
===================================================================
--- lib/Target/AMDGPU/SIInsertWaits.cpp
+++ lib/Target/AMDGPU/SIInsertWaits.cpp
@@ -306,8 +306,7 @@
     // and destination registers don't overlap, e.g. this is illegal:
     //   r0 = load r2
     //   r2 = load r0
-    if ((LastOpcodeType == SMEM && TII->isSMRD(*I)) ||
-        (LastOpcodeType == VMEM && Increment.Named.VM)) {
+    if (LastOpcodeType == VMEM && Increment.Named.VM) {
       // Insert a NOP to break the clause.
       BuildMI(MBB, I, DebugLoc(), TII->get(AMDGPU::S_NOP))
           .addImm(0);