Index: lib/Target/AMDGPU/GCNHazardRecognizer.h
===================================================================
--- lib/Target/AMDGPU/GCNHazardRecognizer.h
+++ lib/Target/AMDGPU/GCNHazardRecognizer.h
@@ -14,6 +14,7 @@
 #ifndef LLVM_LIB_TARGET_AMDGPUHAZARDRECOGNIZERS_H
 #define LLVM_LIB_TARGET_AMDGPUHAZARDRECOGNIZERS_H
 
+#include "llvm/ADT/BitVector.h"
 #include "llvm/ADT/STLExtras.h"
 #include "llvm/CodeGen/ScheduleHazardRecognizer.h"
 #include <list>
@@ -24,6 +25,7 @@
 class MachineInstr;
 class ScheduleDAG;
 class SIInstrInfo;
+class SIRegisterInfo;
 class SISubtarget;
 
 class GCNHazardRecognizer final : public ScheduleHazardRecognizer {
@@ -35,6 +37,20 @@
   const MachineFunction &MF;
   const SISubtarget &ST;
   const SIInstrInfo &TII;
+  const SIRegisterInfo &TRI;
+
+  /// RegUnits of uses in the current soft memory clause.
+  BitVector ClauseUses;
+
+  /// RegUnits of defs in the current soft memory clause.
+  BitVector ClauseDefs;
+
+  void resetClause() { // Forget every RegUnit recorded for the clause.
+    ClauseUses.reset();
+    ClauseDefs.reset();
+  }
+
+  void addClauseInst(const MachineInstr &MI);
 
   int getWaitStatesSince(function_ref<bool(MachineInstr *)> IsHazard);
   int getWaitStatesSinceDef(unsigned Reg,
Index: lib/Target/AMDGPU/GCNHazardRecognizer.cpp
===================================================================
--- lib/Target/AMDGPU/GCNHazardRecognizer.cpp
+++ lib/Target/AMDGPU/GCNHazardRecognizer.cpp
@@ -40,7 +40,10 @@
   CurrCycleInstr(nullptr),
   MF(MF),
   ST(MF.getSubtarget<SISubtarget>()),
-  TII(*ST.getInstrInfo()) {
+  TII(*ST.getInstrInfo()),
+  TRI(TII.getRegisterInfo()),
+  ClauseUses(TRI.getNumRegUnits()),
+  ClauseDefs(TRI.getNumRegUnits()) {
   MaxLookAhead = 5;
 }
 
@@ -258,19 +261,35 @@
 // No-op Hazard Detection
 //===----------------------------------------------------------------------===//
 
-static void addRegsToSet(iterator_range<MachineInstr::const_mop_iterator> Ops,
-                         std::set<unsigned> &Set) {
+static void addRegUnits(const SIRegisterInfo &TRI,
+                        BitVector &BV, unsigned Reg) {
+  for (MCRegUnitIterator RUI(Reg, &TRI); RUI.isValid(); ++RUI)
+    BV.set(*RUI); // One bit per register unit covered by Reg.
+}
+
+static void addRegsToSet(const SIRegisterInfo &TRI,
+                         iterator_range<MachineInstr::const_mop_iterator> Ops,
+                         BitVector &Set) {
   for (const MachineOperand &Op : Ops) {
     if (Op.isReg())
-      Set.insert(Op.getReg());
+      addRegUnits(TRI, Set, Op.getReg()); // Recorded per RegUnit, not per Reg.
   }
 }
 
+void GCNHazardRecognizer::addClauseInst(const MachineInstr &MI) {
+  // XXX: Do we need to worry about implicit operands?
+  addRegsToSet(TRI, MI.defs(), ClauseDefs);
+  addRegsToSet(TRI, MI.uses(), ClauseUses);
+}
+
 int GCNHazardRecognizer::checkSMEMSoftClauseHazards(MachineInstr *SMEM) {
-  // SMEM soft clause are only present on VI+
-  if (ST.getGeneration() < SISubtarget::VOLCANIC_ISLANDS)
+  // SMEM soft clauses are only present on VI+, and only matter if XNACK is
+  // enabled.
+  if (!ST.isXNACKEnabled())
     return 0;
 
+  resetClause();
+
   // A soft-clause is any group of consecutive SMEM instructions.  The
   // instructions in this group may return out of order and/or may be
   // replayed (i.e. the same instruction issued more than once).
@@ -281,21 +300,16 @@
   // (including itself). If we encounter this situaion, we need to break the
   // clause by inserting a non SMEM instruction.
 
-  std::set<unsigned> ClauseDefs;
-  std::set<unsigned> ClauseUses;
-
   for (MachineInstr *MI : EmittedInstrs) {
-
     // When we hit a non-SMEM instruction then we have passed the start of the
     // clause and we can stop.
     if (!MI || !SIInstrInfo::isSMRD(*MI))
       break;
 
-    addRegsToSet(MI->defs(), ClauseDefs);
-    addRegsToSet(MI->uses(), ClauseUses);
+    addClauseInst(*MI);
   }
 
-  if (ClauseDefs.empty())
+  if (ClauseDefs.none())
     return 0;
 
   // FIXME: When we support stores, we need to make sure not to put loads and
@@ -304,21 +318,11 @@
   if (SMEM->mayStore())
     return 1;
 
-  addRegsToSet(SMEM->defs(), ClauseDefs);
-  addRegsToSet(SMEM->uses(), ClauseUses);
-
-  std::vector<unsigned> Result(std::max(ClauseDefs.size(), ClauseUses.size()));
-  std::vector<unsigned>::iterator End;
-
-  End = std::set_intersection(ClauseDefs.begin(), ClauseDefs.end(),
-                              ClauseUses.begin(), ClauseUses.end(), Result.begin());
+  addClauseInst(*SMEM);
 
   // If the set of defs and uses intersect then we cannot add this instruction
   // to the clause, so we have a hazard.
-  if (End != Result.begin())
-    return 1;
-
-  return 0;
+  return ClauseDefs.anyCommon(ClauseUses) ? 1 : 0;
 }
 
 int GCNHazardRecognizer::checkSMRDHazards(MachineInstr *SMRD) {
Index: test/CodeGen/AMDGPU/break-smem-soft-clauses.mir
===================================================================
--- /dev/null
+++ test/CodeGen/AMDGPU/break-smem-soft-clauses.mir
@@ -0,0 +1,351 @@
+# RUN: llc -march=amdgcn -mcpu=carrizo -verify-machineinstrs -run-pass  post-RA-hazard-rec %s -o - | FileCheck -check-prefixes=GCN,XNACK %s
+# RUN: llc -march=amdgcn -mcpu=fiji -verify-machineinstrs -run-pass  post-RA-hazard-rec %s -o - | FileCheck -check-prefixes=GCN,NOXNACK %s
+
+---
+# Trivial clause at beginning of program
+name: trivial_smem_clause_load_smrd4_x1
+
+body: |
+  bb.0:
+    ; GCN-LABEL: name: trivial_smem_clause_load_smrd4_x1
+    ; GCN: %sgpr0 = S_LOAD_DWORD_IMM %sgpr10_sgpr11, 0, 0
+    ; GCN-NEXT: S_ENDPGM
+    %sgpr0 = S_LOAD_DWORD_IMM %sgpr10_sgpr11, 0, 0
+    S_ENDPGM
+...
+---
+# Trivial clause at beginning of program
+name: trivial_smem_clause_load_smrd4_x2
+
+body: |
+  bb.0:
+    ; GCN-LABEL: name: trivial_smem_clause_load_smrd4_x2
+    ; GCN: %sgpr0 = S_LOAD_DWORD_IMM %sgpr10_sgpr11, 0, 0
+    ; GCN-NEXT: %sgpr1 = S_LOAD_DWORD_IMM %sgpr12_sgpr13, 0, 0
+    ; GCN-NEXT: S_ENDPGM
+    %sgpr0 = S_LOAD_DWORD_IMM %sgpr10_sgpr11, 0, 0
+    %sgpr1 = S_LOAD_DWORD_IMM %sgpr12_sgpr13, 0, 0
+    S_ENDPGM
+...
+---
+# Trivial clause at beginning of program
+name: trivial_smem_clause_load_smrd4_x3
+
+body: |
+  bb.0:
+    ; GCN-LABEL: name: trivial_smem_clause_load_smrd4_x3
+    ; GCN: %sgpr0 = S_LOAD_DWORD_IMM %sgpr12_sgpr13, 0, 0
+    ; GCN-NEXT: %sgpr1 = S_LOAD_DWORD_IMM %sgpr6_sgpr7, 0, 0
+    ; GCN-NEXT: %sgpr2 = S_LOAD_DWORD_IMM %sgpr14_sgpr15, 0, 0
+    ; GCN-NEXT: S_ENDPGM
+    %sgpr0 = S_LOAD_DWORD_IMM %sgpr12_sgpr13, 0, 0
+    %sgpr1 = S_LOAD_DWORD_IMM %sgpr6_sgpr7, 0, 0
+    %sgpr2 = S_LOAD_DWORD_IMM %sgpr14_sgpr15, 0, 0
+    S_ENDPGM
+...
+---
+# Trivial clause at beginning of program
+name: trivial_smem_clause_load_smrd4_x4
+
+body: |
+  bb.0:
+    ; GCN-LABEL: name: trivial_smem_clause_load_smrd4_x4
+    ; GCN: %sgpr0 = S_LOAD_DWORD_IMM %sgpr12_sgpr13, 0, 0
+    ; GCN-NEXT: %sgpr1 = S_LOAD_DWORD_IMM %sgpr8_sgpr9, 0, 0
+    ; GCN-NEXT: %sgpr2 = S_LOAD_DWORD_IMM %sgpr14_sgpr15, 0, 0
+    ; GCN-NEXT: %sgpr3 = S_LOAD_DWORD_IMM %sgpr16_sgpr17, 0, 0
+    ; GCN-NEXT: S_ENDPGM
+    %sgpr0 = S_LOAD_DWORD_IMM %sgpr12_sgpr13, 0, 0
+    %sgpr1 = S_LOAD_DWORD_IMM %sgpr8_sgpr9, 0, 0
+    %sgpr2 = S_LOAD_DWORD_IMM %sgpr14_sgpr15, 0, 0
+    %sgpr3 = S_LOAD_DWORD_IMM %sgpr16_sgpr17, 0, 0
+    S_ENDPGM
+...
+---
+# Reuse of same input pointer is OK
+name: trivial_smem_clause_load_smrd4_x2_sameptr
+body: |
+  bb.0:
+    ; GCN-LABEL: name: trivial_smem_clause_load_smrd4_x2_sameptr
+    ; GCN: %sgpr12 = S_LOAD_DWORD_IMM %sgpr10_sgpr11, 0, 0
+    ; GCN-NEXT: %sgpr13 = S_LOAD_DWORD_IMM %sgpr10_sgpr11, 0, 0
+    ; GCN-NEXT: S_ENDPGM
+    %sgpr12 = S_LOAD_DWORD_IMM %sgpr10_sgpr11, 0, 0
+    %sgpr13 = S_LOAD_DWORD_IMM %sgpr10_sgpr11, 0, 0
+    S_ENDPGM
+...
+---
+# 32-bit load partially clobbers its own ptr reg
+name: smrd_load4_overwrite_ptr_lo
+
+body: |
+  bb.0:
+    ; GCN-LABEL: name: smrd_load4_overwrite_ptr_lo
+    ; GCN: %sgpr10 = S_LOAD_DWORD_IMM %sgpr10_sgpr11, 0, 0
+    ; GCN-NEXT: S_ENDPGM
+    %sgpr10 = S_LOAD_DWORD_IMM %sgpr10_sgpr11, 0, 0
+    S_ENDPGM
+...
+---
+# 32-bit load partially clobbers its own ptr reg
+name: smrd_load4_overwrite_ptr_hi
+
+body: |
+  bb.0:
+    ; GCN-LABEL: name: smrd_load4_overwrite_ptr_hi
+    ; GCN: %sgpr11 = S_LOAD_DWORD_IMM %sgpr10_sgpr11, 0, 0
+    ; GCN-NEXT: S_ENDPGM
+    %sgpr11 = S_LOAD_DWORD_IMM %sgpr10_sgpr11, 0, 0
+    S_ENDPGM
+...
+---
+# 64-bit load clobbers its own ptr reg
+name: smrd_load8_overwrite_ptr
+
+body: |
+  bb.0:
+    ; GCN-LABEL: name: smrd_load8_overwrite_ptr
+    ; GCN: %sgpr10_sgpr11 = S_LOAD_DWORDX2_IMM %sgpr10_sgpr11, 0, 0
+    ; GCN-NEXT: S_ENDPGM
+    %sgpr10_sgpr11 = S_LOAD_DWORDX2_IMM %sgpr10_sgpr11, 0, 0
+    S_ENDPGM
+...
+---
+# lgkmcnt has 4 bits, so maximum 16 outstanding loads. The waitcnt
+# breaks the clause.
+
+name: break_smem_clause_at_max_smem_clause_size_smrd_load4
+
+body: |
+  bb.0:
+    ; GCN-LABEL: name: break_smem_clause_at_max_smem_clause_size_smrd_load4
+    ; GCN: %sgpr13 = S_LOAD_DWORD_IMM %sgpr10_sgpr11, 0, 0
+    ; GCN-NEXT: %sgpr14 = S_LOAD_DWORD_IMM %sgpr10_sgpr11, 0, 0
+    ; GCN-NEXT: %sgpr15 = S_LOAD_DWORD_IMM %sgpr10_sgpr11, 0, 0
+    ; GCN-NEXT: %sgpr16 = S_LOAD_DWORD_IMM %sgpr10_sgpr11, 0, 0
+    ; GCN-NEXT: %sgpr17 = S_LOAD_DWORD_IMM %sgpr10_sgpr11, 0, 0
+    ; GCN-NEXT: %sgpr18 = S_LOAD_DWORD_IMM %sgpr10_sgpr11, 0, 0
+    ; GCN-NEXT: %sgpr19 = S_LOAD_DWORD_IMM %sgpr10_sgpr11, 0, 0
+    ; GCN-NEXT: %sgpr20 = S_LOAD_DWORD_IMM %sgpr10_sgpr11, 0, 0
+    ; GCN-NEXT: %sgpr21 = S_LOAD_DWORD_IMM %sgpr10_sgpr11, 0, 0
+    ; GCN-NEXT: %sgpr22 = S_LOAD_DWORD_IMM %sgpr10_sgpr11, 0, 0
+    ; GCN-NEXT: %sgpr23 = S_LOAD_DWORD_IMM %sgpr10_sgpr11, 0, 0
+    ; GCN-NEXT: %sgpr24 = S_LOAD_DWORD_IMM %sgpr10_sgpr11, 0, 0
+    ; GCN-NEXT: %sgpr25 = S_LOAD_DWORD_IMM %sgpr10_sgpr11, 0, 0
+    ; GCN-NEXT: %sgpr26 = S_LOAD_DWORD_IMM %sgpr10_sgpr11, 0, 0
+    ; GCN-NEXT: %sgpr27 = S_LOAD_DWORD_IMM %sgpr10_sgpr11, 0, 0
+    ; GCN-NEXT: %sgpr28 = S_LOAD_DWORD_IMM %sgpr10_sgpr11, 0, 0
+    ; GCN-NEXT: %sgpr0 = S_LOAD_DWORD_IMM %sgpr30_sgpr31, 0, 0
+    ; GCN-NEXT: %sgpr0 = S_MOV_B32 %sgpr0, implicit %sgpr13, implicit %sgpr14, implicit %sgpr15, implicit %sgpr16, implicit %sgpr17, implicit %sgpr18, implicit %sgpr19, implicit %sgpr20, implicit %sgpr21, implicit %sgpr22, implicit %sgpr23, implicit %sgpr24, implicit %sgpr25, implicit %sgpr26, implicit %sgpr27, implicit %sgpr28
+    ; GCN-NEXT: S_ENDPGM
+    %sgpr13 = S_LOAD_DWORD_IMM %sgpr10_sgpr11, 0, 0
+    %sgpr14 = S_LOAD_DWORD_IMM %sgpr10_sgpr11, 0, 0
+    %sgpr15 = S_LOAD_DWORD_IMM %sgpr10_sgpr11, 0, 0
+    %sgpr16 = S_LOAD_DWORD_IMM %sgpr10_sgpr11, 0, 0
+
+    %sgpr17 = S_LOAD_DWORD_IMM %sgpr10_sgpr11, 0, 0
+    %sgpr18 = S_LOAD_DWORD_IMM %sgpr10_sgpr11, 0, 0
+    %sgpr19 = S_LOAD_DWORD_IMM %sgpr10_sgpr11, 0, 0
+    %sgpr20 = S_LOAD_DWORD_IMM %sgpr10_sgpr11, 0, 0
+
+    %sgpr21 = S_LOAD_DWORD_IMM %sgpr10_sgpr11, 0, 0
+    %sgpr22 = S_LOAD_DWORD_IMM %sgpr10_sgpr11, 0, 0
+    %sgpr23 = S_LOAD_DWORD_IMM %sgpr10_sgpr11, 0, 0
+    %sgpr24 = S_LOAD_DWORD_IMM %sgpr10_sgpr11, 0, 0
+
+    %sgpr25 = S_LOAD_DWORD_IMM %sgpr10_sgpr11, 0, 0
+    %sgpr26 = S_LOAD_DWORD_IMM %sgpr10_sgpr11, 0, 0
+    %sgpr27 = S_LOAD_DWORD_IMM %sgpr10_sgpr11, 0, 0
+    %sgpr28 = S_LOAD_DWORD_IMM %sgpr10_sgpr11, 0, 0
+
+    %sgpr0 = S_LOAD_DWORD_IMM %sgpr30_sgpr31, 0, 0
+    %sgpr0 = S_MOV_B32 %sgpr0, implicit %sgpr13, implicit %sgpr14, implicit %sgpr15, implicit %sgpr16, implicit %sgpr17, implicit %sgpr18, implicit %sgpr19, implicit %sgpr20, implicit %sgpr21, implicit %sgpr22, implicit %sgpr23, implicit %sgpr24, implicit %sgpr25, implicit %sgpr26, implicit %sgpr27, implicit %sgpr28
+    S_ENDPGM
+...
+---
+
+name: break_smem_clause_simple_load_smrd4_lo_ptr
+
+body: |
+  bb.0:
+    ; GCN-LABEL: name: break_smem_clause_simple_load_smrd4_lo_ptr
+    ; GCN: %sgpr10 = S_LOAD_DWORD_IMM %sgpr10_sgpr11, 0, 0
+    ; XNACK-NEXT: S_NOP 0
+    ; GCN-NEXT: %sgpr12 = S_LOAD_DWORD_IMM %sgpr12_sgpr13, 0, 0
+    ; GCN-NEXT: S_ENDPGM
+    %sgpr10 = S_LOAD_DWORD_IMM %sgpr10_sgpr11, 0, 0
+    %sgpr12 = S_LOAD_DWORD_IMM %sgpr12_sgpr13, 0, 0
+    S_ENDPGM
+...
+---
+
+name: break_smem_clause_simple_load_smrd4_hi_ptr
+
+body: |
+  bb.0:
+    ; GCN-LABEL: name: break_smem_clause_simple_load_smrd4_hi_ptr
+    ; GCN: %sgpr0 = S_LOAD_DWORD_IMM %sgpr10_sgpr11, 0, 0
+    ; GCN-NEXT: %sgpr3 = S_LOAD_DWORD_IMM %sgpr12_sgpr13, 0, 0
+    ; GCN-NEXT: S_ENDPGM
+    %sgpr0 = S_LOAD_DWORD_IMM %sgpr10_sgpr11, 0, 0
+    %sgpr3 = S_LOAD_DWORD_IMM %sgpr12_sgpr13, 0, 0
+    S_ENDPGM
+...
+---
+
+name: break_smem_clause_simple_load_smrd8_ptr
+
+body: |
+  bb.0:
+    ; GCN-LABEL: name: break_smem_clause_simple_load_smrd8_ptr
+    ; GCN: %sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM %sgpr10_sgpr11, 0, 0
+    ; XNACK-NEXT: S_NOP 0
+    ; GCN-NEXT: %sgpr10_sgpr11 = S_LOAD_DWORDX2_IMM %sgpr12_sgpr13, 0, 0
+    ; GCN-NEXT: S_ENDPGM
+    %sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM %sgpr10_sgpr11, 0, 0
+    %sgpr10_sgpr11 = S_LOAD_DWORDX2_IMM %sgpr12_sgpr13, 0, 0
+    S_ENDPGM
+...
+---
+
+name: break_smem_clause_simple_load_smrd16_ptr
+
+body: |
+  bb.0:
+    ; GCN-LABEL: name: break_smem_clause_simple_load_smrd16_ptr
+    ; GCN: %sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM %sgpr10_sgpr11, 0, 0
+    ; GCN-NEXT: %sgpr12_sgpr13_sgpr14_sgpr15 = S_LOAD_DWORDX4_IMM %sgpr6_sgpr7, 0, 0
+    ; GCN-NEXT: S_ENDPGM
+    %sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM %sgpr10_sgpr11, 0, 0
+    %sgpr12_sgpr13_sgpr14_sgpr15 = S_LOAD_DWORDX4_IMM %sgpr6_sgpr7, 0, 0
+    S_ENDPGM
+...
+---
+
+name: break_smem_clause_block_boundary_load_smrd8_ptr
+
+body: |
+  ; GCN-LABEL: name: break_smem_clause_block_boundary_load_smrd8_ptr
+  ; GCN: bb.0:
+  ; GCN:   successors: %bb.1(0x80000000)
+  ; GCN:   %sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM %sgpr10_sgpr11, 0, 0
+  ; GCN: bb.1:
+  ; XNACK-NEXT:   S_NOP 0
+  ; GCN-NEXT:   %sgpr10_sgpr11 = S_LOAD_DWORDX2_IMM %sgpr12_sgpr13, 0, 0
+  ; GCN-NEXT:   S_ENDPGM
+  bb.0:
+    %sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM %sgpr10_sgpr11, 0, 0
+
+  bb.1:
+    %sgpr10_sgpr11 = S_LOAD_DWORDX2_IMM %sgpr12_sgpr13, 0, 0
+    S_ENDPGM
+...
+---
+# The load clobbers the pointer of the store, so it needs to break.
+
+name: break_smem_clause_store_load_into_ptr_smrd4
+
+body: |
+  bb.0:
+    ; GCN-LABEL: name: break_smem_clause_store_load_into_ptr_smrd4
+    ; GCN: S_STORE_DWORD_IMM %sgpr16, %sgpr10_sgpr11, 0, 0
+    ; GCN-NEXT: %sgpr12 = S_LOAD_DWORD_IMM %sgpr14_sgpr15, 0, 0
+    ; GCN-NEXT: S_ENDPGM
+    S_STORE_DWORD_IMM %sgpr16, %sgpr10_sgpr11, 0, 0
+    %sgpr12 = S_LOAD_DWORD_IMM %sgpr14_sgpr15, 0, 0
+    S_ENDPGM
+...
+---
+# The load clobbers the data of the store, so it needs to break.
+# FIXME: Would it be better to s_nop and wait later?
+
+name: break_smem_clause_store_load_into_data_smrd4
+
+body: |
+  bb.0:
+    ; GCN-LABEL: name: break_smem_clause_store_load_into_data_smrd4
+    ; GCN: S_STORE_DWORD_IMM %sgpr8, %sgpr10_sgpr11, 0, 0
+    ; GCN-NEXT: %sgpr8 = S_LOAD_DWORD_IMM %sgpr12_sgpr13, 0, 0
+    ; GCN-NEXT: S_ENDPGM
+    S_STORE_DWORD_IMM %sgpr8, %sgpr10_sgpr11, 0, 0
+    %sgpr8 = S_LOAD_DWORD_IMM %sgpr12_sgpr13, 0, 0
+    S_ENDPGM
+...
+---
+# Regular VALU instruction breaks clause, no nop needed
+name: valu_inst_breaks_smem_clause
+
+body: |
+  bb.0:
+    ; GCN-LABEL: name: valu_inst_breaks_smem_clause
+    ; GCN: %sgpr0 = S_LOAD_DWORD_IMM %sgpr10_sgpr11, 0, 0
+    ; GCN-NEXT: %vgpr8 = V_MOV_B32_e32 0, implicit %exec
+    ; GCN-NEXT: %sgpr2 = S_LOAD_DWORD_IMM %sgpr12_sgpr13, 0, 0
+    ; GCN-NEXT: S_ENDPGM
+    %sgpr0 = S_LOAD_DWORD_IMM %sgpr10_sgpr11, 0, 0
+    %vgpr8 = V_MOV_B32_e32 0, implicit %exec
+    %sgpr2 = S_LOAD_DWORD_IMM %sgpr12_sgpr13, 0, 0
+    S_ENDPGM
+...
+---
+# Regular SALU instruction breaks clause, no nop needed
+name: salu_inst_breaks_smem_clause
+
+body: |
+  bb.0:
+    ; GCN-LABEL: name: salu_inst_breaks_smem_clause
+    ; GCN: %sgpr0 = S_LOAD_DWORD_IMM %sgpr10_sgpr11, 0, 0
+    ; GCN-NEXT: %sgpr8 = S_MOV_B32 0
+    ; GCN-NEXT: %sgpr2 = S_LOAD_DWORD_IMM %sgpr12_sgpr13, 0, 0
+    ; GCN-NEXT: S_ENDPGM
+    %sgpr0 = S_LOAD_DWORD_IMM %sgpr10_sgpr11, 0, 0
+    %sgpr8 = S_MOV_B32 0
+    %sgpr2 = S_LOAD_DWORD_IMM %sgpr12_sgpr13, 0, 0
+    S_ENDPGM
+...
+---
+name: ds_inst_breaks_smem_clause
+
+body: |
+  bb.0:
+    ; GCN-LABEL: name: ds_inst_breaks_smem_clause
+    ; GCN: %sgpr0 = S_LOAD_DWORD_IMM %sgpr10_sgpr11, 0, 0
+    ; GCN-NEXT: %vgpr8 = DS_READ_B32 %vgpr9, 0, 0, implicit %m0, implicit %exec
+    ; GCN-NEXT: %sgpr2 = S_LOAD_DWORD_IMM %sgpr12_sgpr13, 0, 0
+    ; GCN-NEXT: S_ENDPGM
+    %sgpr0 = S_LOAD_DWORD_IMM %sgpr10_sgpr11, 0, 0
+    %vgpr8 = DS_READ_B32 %vgpr9, 0, 0, implicit %m0, implicit %exec
+    %sgpr2 = S_LOAD_DWORD_IMM %sgpr12_sgpr13, 0, 0
+    S_ENDPGM
+...
+---
+
+name: flat_inst_breaks_smem_clause
+
+body: |
+  bb.0:
+    ; GCN-LABEL: name: flat_inst_breaks_smem_clause
+    ; GCN: %sgpr0 = S_LOAD_DWORD_IMM %sgpr10_sgpr11, 0, 0
+    ; GCN-NEXT: %vgpr0 = FLAT_LOAD_DWORD %vgpr0_vgpr1, 0, 0, 0, implicit %exec, implicit %flat_scr
+    ; GCN-NEXT: %sgpr2 = S_LOAD_DWORD_IMM %sgpr12_sgpr13, 0, 0
+    ; GCN-NEXT: S_ENDPGM
+    %sgpr0 = S_LOAD_DWORD_IMM %sgpr10_sgpr11, 0, 0
+    %vgpr0 = FLAT_LOAD_DWORD %vgpr0_vgpr1, 0, 0, 0, implicit %exec, implicit %flat_scr
+    %sgpr2 = S_LOAD_DWORD_IMM %sgpr12_sgpr13, 0, 0
+    S_ENDPGM
+...
+---
+# FIXME: Should this be handled?
+name: implicit_use_breaks_smem_clause
+
+body: |
+  bb.0:
+    ; GCN-LABEL: name: implicit_use_breaks_smem_clause
+    ; GCN: %sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM %sgpr10_sgpr11, 0, 0, implicit %sgpr12_sgpr13
+    ; XNACK-NEXT: S_NOP 0
+    ; GCN-NEXT: %sgpr12_sgpr13 = S_LOAD_DWORDX2_IMM %sgpr6_sgpr7, 0, 0
+    ; GCN-NEXT: S_ENDPGM
+    %sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM %sgpr10_sgpr11, 0, 0, implicit %sgpr12_sgpr13
+    %sgpr12_sgpr13 = S_LOAD_DWORDX2_IMM %sgpr6_sgpr7, 0, 0
+    S_ENDPGM
+...
Index: test/CodeGen/AMDGPU/immv216.ll
===================================================================
--- test/CodeGen/AMDGPU/immv216.ll
+++ test/CodeGen/AMDGPU/immv216.ll
@@ -282,9 +282,9 @@
 ; GFX9: v_pk_add_f16 [[REG:v[0-9]+]], [[VAL]], 0.5
 ; GFX9: buffer_store_dword [[REG]]
 
-; VI: v_mov_b32_e32 [[CONST05:v[0-9]+]], 0x3800
 ; VI: buffer_load_dword
 ; VI-NOT: and
+; VI: v_mov_b32_e32 [[CONST05:v[0-9]+]], 0x3800
 ; VI-DAG: v_add_f16_sdwa v{{[0-9]+}}, v{{[0-9]+}}, [[CONST05]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
 ; VI-DAG: v_add_f16_e32 v{{[0-9]+}}, 0.5, v{{[0-9]+}}
 ; VI: v_or_b32
Index: test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll
===================================================================
--- test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll
+++ test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll
@@ -1,4 +1,4 @@
-; RUN: llc -verify-machineinstrs -march=amdgcn -mcpu=gfx901 -enable-amdgpu-aa=0 -mattr=+flat-for-global,-fp64-fp16-denormals < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=GFX9 -check-prefix=GFX89 %s
+; RUN: llc -verify-machineinstrs -march=amdgcn -mcpu=gfx900 -enable-amdgpu-aa=0 -mattr=+flat-for-global,-fp64-fp16-denormals < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=GFX9 -check-prefix=GFX89 %s
 ; RUN: llc -verify-machineinstrs -march=amdgcn -mcpu=fiji -enable-amdgpu-aa=0 -mattr=+flat-for-global < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=CIVI -check-prefix=VI -check-prefix=GFX89 %s
 ; RUN: llc -verify-machineinstrs -march=amdgcn -mcpu=hawaii -enable-amdgpu-aa=0 -mattr=+flat-for-global < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=CIVI -check-prefix=CI %s
 
@@ -428,10 +428,12 @@
 
 ; GCN-LABEL: {{^}}v_insertelement_v2i16_dynamic_vgpr:
 ; GFX89-DAG: s_mov_b32 [[MASKK:s[0-9]+]], 0xffff{{$}}
-; GCN-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 0x3e7
+; CI-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 0x3e7
+
 ; GCN: {{flat|global}}_load_dword [[IDX:v[0-9]+]]
 ; GCN: {{flat|global}}_load_dword [[VEC:v[0-9]+]]
 
+; GFX89-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 0x3e7
 ; GFX89-DAG: v_lshlrev_b32_e32 [[SCALED_IDX:v[0-9]+]], 16, [[IDX]]
 ; GFX89-DAG: v_lshlrev_b32_e64 [[MASK:v[0-9]+]], [[SCALED_IDX]], [[MASKK]]
 
@@ -455,11 +457,12 @@
 
 ; GCN-LABEL: {{^}}v_insertelement_v2f16_dynamic_vgpr:
 ; GFX89-DAG: s_mov_b32 [[MASKK:s[0-9]+]], 0xffff{{$}}
-; GCN-DAG:   v_mov_b32_e32 [[K:v[0-9]+]], 0x1234
+; CI-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 0x1234
 
 ; GCN: {{flat|global}}_load_dword [[IDX:v[0-9]+]]
 ; GCN: {{flat|global}}_load_dword [[VEC:v[0-9]+]]
 
+; GFX89-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 0x1234
 ; GFX89-DAG: v_lshlrev_b32_e32 [[SCALED_IDX:v[0-9]+]], 16, [[IDX]]
 ; GFX89-DAG: v_lshlrev_b32_e64 [[MASK:v[0-9]+]], [[SCALED_IDX]], [[MASKK]]
 
Index: test/CodeGen/AMDGPU/llvm.amdgcn.div.fixup.f16.ll
===================================================================
--- test/CodeGen/AMDGPU/llvm.amdgcn.div.fixup.f16.ll
+++ test/CodeGen/AMDGPU/llvm.amdgcn.div.fixup.f16.ll
@@ -81,8 +81,8 @@
 }
 
 ; GCN-LABEL: {{^}}div_fixup_f16_imm_a_imm_b
-; VI:  v_mov_b32_e32 v[[AB_F16:[0-9]+]], 0x4200{{$}}
-; GCN: buffer_load_ushort v[[C_F16:[0-9]+]]
+; VI-DAG:  v_mov_b32_e32 v[[AB_F16:[0-9]+]], 0x4200{{$}}
+; GCN-DAG: buffer_load_ushort v[[C_F16:[0-9]+]]
 ; VI:  v_div_fixup_f16 v[[R_F16:[0-9]+]], v[[AB_F16]], v[[AB_F16]], v[[C_F16]]
 ; GCN: buffer_store_short v[[R_F16]]
 ; GCN: s_endpgm
@@ -97,8 +97,8 @@
 }
 
 ; GCN-LABEL: {{^}}div_fixup_f16_imm_b_imm_c
-; VI:  v_mov_b32_e32 v[[BC_F16:[0-9]+]], 0x4200{{$}}
-; GCN: buffer_load_ushort v[[A_F16:[0-9]+]]
+; VI-DAG:  v_mov_b32_e32 v[[BC_F16:[0-9]+]], 0x4200{{$}}
+; GCN-DAG: buffer_load_ushort v[[A_F16:[0-9]+]]
 ; VI:  v_div_fixup_f16 v[[R_F16:[0-9]+]], v[[A_F16]], v[[BC_F16]], v[[BC_F16]]
 ; GCN: buffer_store_short v[[R_F16]]
 ; GCN: s_endpgm
@@ -113,8 +113,8 @@
 }
 
 ; GCN-LABEL: {{^}}div_fixup_f16_imm_a_imm_c
-; VI:  v_mov_b32_e32 v[[AC_F16:[0-9]+]], 0x4200{{$}}
-; GCN: buffer_load_ushort v[[B_F16:[0-9]+]]
+; VI-DAG:  v_mov_b32_e32 v[[AC_F16:[0-9]+]], 0x4200{{$}}
+; GCN-DAG: buffer_load_ushort v[[B_F16:[0-9]+]]
 ; VI:  v_div_fixup_f16 v[[R_F16:[0-9]+]], v[[AC_F16]], v[[B_F16]], v[[AC_F16]]
 ; GCN: buffer_store_short v[[R_F16]]
 ; GCN: s_endpgm
Index: test/CodeGen/AMDGPU/use-sgpr-multiple-times.ll
===================================================================
--- test/CodeGen/AMDGPU/use-sgpr-multiple-times.ll
+++ test/CodeGen/AMDGPU/use-sgpr-multiple-times.ll
@@ -45,14 +45,16 @@
 ; GCN-DAG: s_load_dword [[SA:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, {{0xb|0x2c}}
 ; GCN-DAG: s_load_dword [[SB:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, {{0xc|0x30}}
 ; SI: buffer_load_dword [[VA0:v[0-9]+]]
-; SI: buffer_load_dword [[VA1:v[0-9]+]]
+; SI-NEXT: buffer_load_dword [[VA1:v[0-9]+]]
 
 ; GCN-NOT: v_mov_b32
-; GCN: v_mov_b32_e32 [[VB:v[0-9]+]], [[SB]]
-; GCN-NOT: v_mov_b32
 
 ; VI: buffer_load_dword [[VA0:v[0-9]+]]
-; VI: buffer_load_dword [[VA1:v[0-9]+]]
+; VI-NEXT: buffer_load_dword [[VA1:v[0-9]+]]
+
+; GCN-NOT: v_mov_b32
+; GCN: v_mov_b32_e32 [[VB:v[0-9]+]], [[SB]]
+; GCN-NOT: v_mov_b32
 
 ; GCN-DAG: v_fma_f32 [[RESULT0:v[0-9]+]], [[SA]], [[VA0]], [[VB]]
 ; GCN-DAG: v_fma_f32 [[RESULT1:v[0-9]+]], [[SA]], [[VA1]], [[VB]]