Index: lib/Target/AMDGPU/SILoadStoreOptimizer.cpp
===================================================================
--- lib/Target/AMDGPU/SILoadStoreOptimizer.cpp
+++ lib/Target/AMDGPU/SILoadStoreOptimizer.cpp
@@ -173,10 +173,18 @@
   }
 }
 
-static void addDefsToList(const MachineInstr &MI, DenseSet<unsigned> &Defs) {
-  // XXX: Should this be looking for implicit defs?
-  for (const MachineOperand &Def : MI.defs())
-    Defs.insert(Def.getReg());
+static void addDefsUsesToList(const MachineInstr &MI,
+                              DenseSet<unsigned> &RegDefs,
+                              DenseSet<unsigned> &PhysRegUses) {
+  for (const MachineOperand &Op : MI.operands()) {
+    if (Op.isReg()) {
+      if (Op.isDef())
+        RegDefs.insert(Op.getReg());
+      else if (Op.readsReg() &&
+               TargetRegisterInfo::isPhysicalRegister(Op.getReg()))
+        PhysRegUses.insert(Op.getReg());
+    }
+  }
 }
 
 static bool memAccessesCanBeReordered(MachineBasicBlock::iterator A,
@@ -194,16 +202,24 @@
 // already in the list. Returns true in that case.
 static bool
 addToListsIfDependent(MachineInstr &MI,
-                      DenseSet<unsigned> &Defs,
+                      DenseSet<unsigned> &RegDefs,
+                      DenseSet<unsigned> &PhysRegUses,
                       SmallVectorImpl<MachineInstr *> &Insts) {
   for (MachineOperand &Use : MI.operands()) {
     // If one of the defs is read, then there is a use of Def between I and the
     // instruction that I will potentially be merged with. We will need to move
     // this instruction after the merged instructions.
-
-    if (Use.isReg() && Use.readsReg() && Defs.count(Use.getReg())) {
+    //
+    // Similarly, if there is a def which is read by an instruction that is to
+    // be moved for merging, then we need to move the def-instruction as well.
+    // This can only happen for physical registers such as M0; virtual
+    // registers are in SSA form.
+    if (Use.isReg() &&
+        ((Use.readsReg() && RegDefs.count(Use.getReg())) ||
+         (Use.isDef() && TargetRegisterInfo::isPhysicalRegister(Use.getReg()) &&
+          PhysRegUses.count(Use.getReg())))) {
       Insts.push_back(&MI);
-      addDefsToList(MI, Defs);
+      addDefsUsesToList(MI, RegDefs, PhysRegUses);
       return true;
     }
   }
@@ -227,16 +243,6 @@
   return true;
 }
 
-static bool
-hasPhysRegDef(MachineInstr &MI) {
-  for (const MachineOperand &Def : MI.defs()) {
-    if (Def.isReg() &&
-        TargetRegisterInfo::isPhysicalRegister(Def.getReg()))
-      return true;
-  }
-  return false;
-}
-
 bool SILoadStoreOptimizer::offsetsCanBeCombined(CombineInfo &CI) {
   // XXX - Would the same offset be OK? Is there any reason this would happen or
   // be useful?
@@ -342,8 +348,9 @@
 
   ++MBBI;
 
-  DenseSet<unsigned> DefsToMove;
-  addDefsToList(*CI.I, DefsToMove);
+  DenseSet<unsigned> RegDefsToMove;
+  DenseSet<unsigned> PhysRegUsesToMove;
+  addDefsUsesToList(*CI.I, RegDefsToMove, PhysRegUsesToMove);
 
   for ( ; MBBI != E; ++MBBI) {
     if (MBBI->getOpcode() != CI.I->getOpcode()) {
@@ -359,13 +366,6 @@
         return false;
       }
 
-      if (hasPhysRegDef(*MBBI)) {
-        // We could re-order this instruction in theory, but it would require
-        // tracking physreg defs and uses. This should only affect M0 in
-        // practice.
-        return false;
-      }
-
       if (MBBI->mayLoadOrStore() &&
           (!memAccessesCanBeReordered(*CI.I, *MBBI, TII, AA) ||
            !canMoveInstsAcrossMemOp(*MBBI, CI.InstsToMove, TII, AA))) {
@@ -373,14 +373,15 @@
         // #2. Add this instruction to the move list and then we will check
         // if condition #2 holds once we have selected the matching instruction.
         CI.InstsToMove.push_back(&*MBBI);
-        addDefsToList(*MBBI, DefsToMove);
+        addDefsUsesToList(*MBBI, RegDefsToMove, PhysRegUsesToMove);
         continue;
       }
 
       // When we match I with another DS instruction we will be moving I down
       // to the location of the matched instruction any uses of I will need to
      // be moved down as well.
-      addToListsIfDependent(*MBBI, DefsToMove, CI.InstsToMove);
+      addToListsIfDependent(*MBBI, RegDefsToMove, PhysRegUsesToMove,
+                            CI.InstsToMove);
       continue;
     }
 
@@ -394,7 +395,8 @@
     //   DS_WRITE_B32 addr, f(w), idx1
     // where the DS_READ_B32 ends up in InstsToMove and therefore prevents
    // merging of the two writes.
-    if (addToListsIfDependent(*MBBI, DefsToMove, CI.InstsToMove))
+    if (addToListsIfDependent(*MBBI, RegDefsToMove, PhysRegUsesToMove,
+                              CI.InstsToMove))
       continue;
 
     bool Match = true;
@@ -453,8 +455,7 @@
     // down past this instruction.
     // check if we can move I across MBBI and if we can move all I's users
     if (!memAccessesCanBeReordered(*CI.I, *MBBI, TII, AA) ||
-        !canMoveInstsAcrossMemOp(*MBBI, CI.InstsToMove, TII, AA) ||
-        hasPhysRegDef(*MBBI))
+        !canMoveInstsAcrossMemOp(*MBBI, CI.InstsToMove, TII, AA))
       break;
   }
   return false;
Index: test/CodeGen/AMDGPU/smrd.ll
===================================================================
--- test/CodeGen/AMDGPU/smrd.ll
+++ test/CodeGen/AMDGPU/smrd.ll
@@ -1,6 +1,6 @@
-; RUN: llc -march=amdgcn -mcpu=tahiti -verify-machineinstrs -show-mc-encoding < %s | FileCheck -check-prefix=SI -check-prefix=GCN -check-prefix=SICI -check-prefix=SIVIGFX9 %s
-; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs -show-mc-encoding < %s | FileCheck -check-prefix=CI -check-prefix=GCN -check-prefix=SICI %s
-; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs -show-mc-encoding < %s | FileCheck -check-prefix=VI -check-prefix=GCN -check-prefix=VIGFX9 -check-prefix=SIVIGFX9 %s
+; RUN: llc -march=amdgcn -mcpu=tahiti -verify-machineinstrs -show-mc-encoding < %s | FileCheck -check-prefix=SI -check-prefix=GCN -check-prefix=SICIVI -check-prefix=SICI -check-prefix=SIVIGFX9 %s
+; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs -show-mc-encoding < %s | FileCheck -check-prefix=CI -check-prefix=GCN -check-prefix=SICIVI -check-prefix=SICI %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs -show-mc-encoding < %s | FileCheck -check-prefix=VI -check-prefix=GCN -check-prefix=SICIVI -check-prefix=VIGFX9 -check-prefix=SIVIGFX9 %s
 ; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs -show-mc-encoding < %s | FileCheck -check-prefix=GFX9 -check-prefix=GCN -check-prefix=VIGFX9 -check-prefix=SIVIGFX9 %s
 
 ; SMRD load with an immediate offset.
@@ -242,15 +242,24 @@
   ret void
 }
 
-; GCN-LABEL: {{^}}smrd_imm_nomerge_m0:
+; GCN-LABEL: {{^}}smrd_imm_merge_m0:
 ;
-; In principle we could merge the loads here as well, but it would require
-; careful tracking of physical registers since both v_interp* and v_movrel*
-; instructions (or gpr idx mode) use M0.
+; SICIVI: s_buffer_load_dwordx2
+; SICIVI: s_mov_b32 m0
+; SICIVI-DAG: v_interp_p1_f32
+; SICIVI-DAG: v_interp_p1_f32
+; SICIVI-DAG: v_interp_p1_f32
+; SICIVI-DAG: v_interp_p2_f32
+; SICIVI-DAG: v_interp_p2_f32
+; SICIVI-DAG: v_interp_p2_f32
+; SICIVI: s_mov_b32 m0
+; SICIVI: v_movrels_b32_e32
 ;
-; GCN: s_buffer_load_dword
-; GCN: s_buffer_load_dword
-define amdgpu_ps float @smrd_imm_nomerge_m0(<4 x i32> inreg %desc, i32 inreg %prim, float %u, float %v) #0 {
+; Merging is still thwarted on GFX9 due to s_set_gpr_idx
+;
+; GFX9: s_buffer_load_dword
+; GFX9: s_buffer_load_dword
+define amdgpu_ps float @smrd_imm_merge_m0(<4 x i32> inreg %desc, i32 inreg %prim, float %u, float %v) #0 {
 main_body:
   %idx1.f = call float @llvm.SI.load.const.v4i32(<4 x i32> %desc, i32 0)
   %idx1 = bitcast float %idx1.f to i32