Index: lib/Target/AMDGPU/SILoadStoreOptimizer.cpp
===================================================================
--- lib/Target/AMDGPU/SILoadStoreOptimizer.cpp
+++ lib/Target/AMDGPU/SILoadStoreOptimizer.cpp
@@ -227,6 +227,16 @@
   return true;
 }
 
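+/// \returns true if \p MI defines a physical register.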
+static bool
+hasPhysRegDef(MachineInstr &MI) {
+  for (const MachineOperand &Def : MI.defs()) {
+    if (Def.isReg() &&
+        TargetRegisterInfo::isPhysicalRegister(Def.getReg()))
+      return true;
+  }
+  return false;
+}
+
 bool SILoadStoreOptimizer::offsetsCanBeCombined(CombineInfo &CI) {
   // XXX - Would the same offset be OK? Is there any reason this would happen or
   // be useful?
@@ -349,6 +359,13 @@
         return false;
       }
 
+      if (hasPhysRegDef(*MBBI)) {
+        // We could re-order this instruction in theory, but it would require
+        // tracking physreg defs and uses. This should only affect M0 in
+        // practice.
+        return false;
+      }
+
       if (MBBI->mayLoadOrStore() &&
         (!memAccessesCanBeReordered(*CI.I, *MBBI, TII, AA) ||
          !canMoveInstsAcrossMemOp(*MBBI, CI.InstsToMove, TII, AA))) {
@@ -436,7 +453,8 @@
     // down past this instruction.
     // check if we can move I across MBBI and if we can move all I's users
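+    // Also give up if MBBI defines a physical register; moving instructions
+    // across such a def would require tracking physreg uses (M0 in practice).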
     if (!memAccessesCanBeReordered(*CI.I, *MBBI, TII, AA) ||
-      !canMoveInstsAcrossMemOp(*MBBI, CI.InstsToMove, TII, AA))
+        !canMoveInstsAcrossMemOp(*MBBI, CI.InstsToMove, TII, AA) ||
+        hasPhysRegDef(*MBBI))
       break;
   }
   return false;
Index: test/CodeGen/AMDGPU/ds_read2.ll
===================================================================
--- test/CodeGen/AMDGPU/ds_read2.ll
+++ test/CodeGen/AMDGPU/ds_read2.ll
@@ -613,6 +613,24 @@
   ret void
 }
 
+; GCN-LABEL: ds_read_call_read:
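+;
+; The call defines physical registers, so the two ds_read_b32 must not be
+; merged into a single ds_read2_b32 across it.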
+; GCN: ds_read_b32
+; GCN: s_swappc_b64
+; GCN: ds_read_b32
+define amdgpu_kernel void @ds_read_call_read(i32 addrspace(1)* %out, i32 addrspace(3)* %arg) {
+  %x = call i32 @llvm.amdgcn.workitem.id.x()
+  %arrayidx0 = getelementptr i32, i32 addrspace(3)* %arg, i32 %x
+  %arrayidx1 = getelementptr i32, i32 addrspace(3)* %arrayidx0, i32 1
+  %v0 = load i32, i32 addrspace(3)* %arrayidx0, align 4
+  call void @void_func_void()
+  %v1 = load i32, i32 addrspace(3)* %arrayidx1, align 4
+  %r = add i32 %v0, %v1
+  store i32 %r, i32 addrspace(1)* %out, align 4
+  ret void
+}
+
+declare void @void_func_void() #3
+
 declare i32 @llvm.amdgcn.workgroup.id.x() #1
 declare i32 @llvm.amdgcn.workgroup.id.y() #1
 declare i32 @llvm.amdgcn.workitem.id.x() #1
@@ -623,3 +641,4 @@
 attributes #0 = { nounwind }
 attributes #1 = { nounwind readnone speculatable }
 attributes #2 = { convergent nounwind }
+attributes #3 = { nounwind noinline }
Index: test/CodeGen/AMDGPU/llvm.amdgcn.interp.ll
===================================================================
--- test/CodeGen/AMDGPU/llvm.amdgcn.interp.ll
+++ test/CodeGen/AMDGPU/llvm.amdgcn.interp.ll
@@ -160,21 +160,25 @@
 
 ; SI won't merge ds memory operations, because of the signed offset bug, so
 ; we only have check lines for VI.
-; VI-LABEL: v_interp_readnone:
-; VI: s_mov_b32 m0, 0
-; VI-DAG: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0
-; VI-DAG: v_interp_mov_f32 v{{[0-9]+}}, p0, attr0.x{{$}}
-; VI: s_mov_b32 m0, -1{{$}}
-; VI: ds_write2_b32 v{{[0-9]+}}, [[ZERO]], [[ZERO]] offset1:4
-define amdgpu_ps void @v_interp_readnone(float addrspace(3)* %lds) #0 {
-bb:
-  store float 0.000000e+00, float addrspace(3)* %lds
-  %tmp1 = call float @llvm.amdgcn.interp.mov(i32 2, i32 0, i32 0, i32 0)
-  %tmp2 = getelementptr float, float addrspace(3)* %lds, i32 4
-  store float 0.000000e+00, float addrspace(3)* %tmp2
-  call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %tmp1, float %tmp1, float %tmp1, float %tmp1, i1 true, i1 true) #0
-  ret void
-}
+;
+; TODO: VI won't merge them either, because we are conservative about moving
+; instructions past changes to physregs.
+;
+; TODO-VI-LABEL: v_interp_readnone:
+; TODO-VI: s_mov_b32 m0, 0
+; TODO-VI-DAG: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0
+; TODO-VI-DAG: v_interp_mov_f32 v{{[0-9]+}}, p0, attr0.x{{$}}
+; TODO-VI: s_mov_b32 m0, -1{{$}}
+; TODO-VI: ds_write2_b32 v{{[0-9]+}}, [[ZERO]], [[ZERO]] offset1:4
+;define amdgpu_ps void @v_interp_readnone(float addrspace(3)* %lds) #0 {
+;bb:
+;  store float 0.000000e+00, float addrspace(3)* %lds
+;  %tmp1 = call float @llvm.amdgcn.interp.mov(i32 2, i32 0, i32 0, i32 0)
+;  %tmp2 = getelementptr float, float addrspace(3)* %lds, i32 4
+;  store float 0.000000e+00, float addrspace(3)* %tmp2
+;  call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %tmp1, float %tmp1, float %tmp1, float %tmp1, i1 true, i1 true) #0
+;  ret void
+;}
 
 ; Test that v_interp_p1 uses different source and destination registers
 ; on 16 bank LDS chips.
Index: test/CodeGen/AMDGPU/smrd.ll
===================================================================
--- test/CodeGen/AMDGPU/smrd.ll
+++ test/CodeGen/AMDGPU/smrd.ll
@@ -242,6 +242,48 @@
   ret void
 }
 
+; GCN-LABEL: {{^}}smrd_imm_nomerge_m0:
+;
+; In principle we could merge the loads here as well, but it would require
+; careful tracking of physical registers since both v_interp* and v_movrel*
+; instructions (or gpr idx mode) use M0.
+;
+; GCN: s_buffer_load_dword
+; GCN: s_buffer_load_dword
+define amdgpu_ps float @smrd_imm_nomerge_m0(<4 x i32> inreg %desc, i32 inreg %prim, float %u, float %v) #0 {
+main_body:
+  %idx1.f = call float @llvm.SI.load.const.v4i32(<4 x i32> %desc, i32 0)
+  %idx1 = bitcast float %idx1.f to i32
+
+  %v0.x1 = call nsz float @llvm.amdgcn.interp.p1(float %u, i32 0, i32 0, i32 %prim)
+  %v0.x = call nsz float @llvm.amdgcn.interp.p2(float %v0.x1, float %v, i32 0, i32 0, i32 %prim)
+  %v0.y1 = call nsz float @llvm.amdgcn.interp.p1(float %u, i32 0, i32 1, i32 %prim)
+  %v0.y = call nsz float @llvm.amdgcn.interp.p2(float %v0.y1, float %v, i32 0, i32 1, i32 %prim)
+  %v0.z1 = call nsz float @llvm.amdgcn.interp.p1(float %u, i32 0, i32 2, i32 %prim)
+  %v0.z = call nsz float @llvm.amdgcn.interp.p2(float %v0.z1, float %v, i32 0, i32 2, i32 %prim)
+  %v0.tmp0 = insertelement <3 x float> undef, float %v0.x, i32 0
+  %v0.tmp1 = insertelement <3 x float> %v0.tmp0, float %v0.y, i32 1
+  %v0 = insertelement <3 x float> %v0.tmp1, float %v0.z, i32 2
+  %a = extractelement <3 x float> %v0, i32 %idx1
+
+  %v1.x1 = call nsz float @llvm.amdgcn.interp.p1(float %u, i32 1, i32 0, i32 %prim)
+  %v1.x = call nsz float @llvm.amdgcn.interp.p2(float %v1.x1, float %v, i32 1, i32 0, i32 %prim)
+  %v1.y1 = call nsz float @llvm.amdgcn.interp.p1(float %u, i32 1, i32 1, i32 %prim)
+  %v1.y = call nsz float @llvm.amdgcn.interp.p2(float %v1.y1, float %v, i32 1, i32 1, i32 %prim)
+  %v1.z1 = call nsz float @llvm.amdgcn.interp.p1(float %u, i32 1, i32 2, i32 %prim)
+  %v1.z = call nsz float @llvm.amdgcn.interp.p2(float %v1.z1, float %v, i32 1, i32 2, i32 %prim)
+  %v1.tmp0 = insertelement <3 x float> undef, float %v1.x, i32 0
+  %v1.tmp1 = insertelement <3 x float> %v1.tmp0, float %v1.y, i32 1
+  %v1 = insertelement <3 x float> %v1.tmp1, float %v1.z, i32 2
+
+  %b = extractelement <3 x float> %v1, i32 %idx1
+  %c = call float @llvm.SI.load.const.v4i32(<4 x i32> %desc, i32 4)
+
+  %res.tmp = fadd float %a, %b
+  %res = fadd float %res.tmp, %c
+  ret float %res
+}
+
 ; GCN-LABEL: {{^}}smrd_vgpr_merged:
 ; GCN-NEXT: %bb.
 
@@ -275,6 +317,9 @@
 
 declare void @llvm.amdgcn.exp.f32(i32, i32, float, float, float, float, i1, i1) #0
 declare float @llvm.SI.load.const.v4i32(<4 x i32>, i32) #1
+declare float @llvm.amdgcn.interp.p1(float, i32, i32, i32) #2
+declare float @llvm.amdgcn.interp.p2(float, float, i32, i32, i32) #2
 
 attributes #0 = { nounwind }
 attributes #1 = { nounwind readnone }
+attributes #2 = { nounwind readnone speculatable }