Index: lib/CodeGen/TwoAddressInstructionPass.cpp
===================================================================
--- lib/CodeGen/TwoAddressInstructionPass.cpp
+++ lib/CodeGen/TwoAddressInstructionPass.cpp
@@ -1437,6 +1437,16 @@
   return AnyOps;
 }
 
+// Rewrite each untied register operand of MI that names OldReg to NewReg.
+static void substituteRegisterUses(MachineRegisterInfo &MRI, MachineInstr &MI,
+                                   unsigned OldReg, unsigned NewReg) {
+  for (MachineOperand &Op : MI.operands()) {
+    if (!Op.isReg() || Op.isTied() || Op.getReg() != OldReg)
+      continue;
+    Op.setReg(NewReg);
+  }
+}
+
 // Process a list of tied MI operands that all use the same source register.
 // The tied pairs are of the form (SrcIdx, DstIdx).
 void
@@ -1548,6 +1558,9 @@
     // whether the dest oper writes a subreg, the source oper should not.
     MO.setSubReg(0);
 
+    if (!DstMO.isEarlyClobber())
+      substituteRegisterUses(*MRI, *MI, RegB, RegA);
+
     // Propagate SrcRegMap.
     SrcRegMap[RegA] = RegB;
   }
Index: test/CodeGen/AMDGPU/indirect-addressing-si-noopt.ll
===================================================================
--- /dev/null
+++ test/CodeGen/AMDGPU/indirect-addressing-si-noopt.ll
@@ -0,0 +1,19 @@
+; RUN: llc -O0 -march=amdgcn -verify-machineinstrs < %s | FileCheck %s
+
+; FIXME: Merge into indirect-addressing-si.ll
+
+; Make sure that the TwoAddressInstruction pass keeps src0 as subregister sub0
+; of the tied implicit use and def of the super register.
+
+; CHECK-LABEL: {{^}}insert_wo_offset:
+; CHECK: s_load_dword [[IN:s[0-9]+]]
+; CHECK: s_mov_b32 m0, [[IN]]
+; CHECK: v_movreld_b32_e32 v[[ELT0:[0-9]+]]
+; CHECK-NEXT: buffer_store_dwordx4 v{{\[}}[[ELT0]]:
+define void @insert_wo_offset(<4 x float> addrspace(1)* %out, i32 %in) {
+entry: ; insert 5.0 at the runtime index %in, then store the whole <4 x float> to %out
+  %ins = insertelement <4 x float> <float 1.0, float 2.0, float 3.0, float 4.0>, float 5.0, i32 %in
+  store <4 x float> %ins, <4 x float> addrspace(1)* %out
+  ret void
+}
+
Index: test/CodeGen/AMDGPU/indirect-addressing-si.ll
===================================================================
--- test/CodeGen/AMDGPU/indirect-addressing-si.ll
+++ test/CodeGen/AMDGPU/indirect-addressing-si.ll
@@ -125,27 +125,32 @@
 }
 
 ; CHECK-LABEL: {{^}}insert_w_offset:
-; CHECK: s_load_dword [[IN:s[0-9]+]]
-; CHECK: s_mov_b32 m0, [[IN]]
-; CHECK: v_movreld_b32_e32
-define void @insert_w_offset(float addrspace(1)* %out, i32 %in) {
+; CHECK-DAG: s_load_dword [[IN:s[0-9]+]]
+; CHECK-DAG: s_mov_b32 m0, [[IN]]
+; CHECK-DAG: v_mov_b32_e32 v[[ELT0:[0-9]+]], 1.0
+; CHECK-DAG: v_mov_b32_e32 v[[ELT1:[0-9]+]], 2.0
+; CHECK-DAG: v_mov_b32_e32 v[[ELT2:[0-9]+]], 0x40400000
+; CHECK-DAG: v_mov_b32_e32 v[[ELT3:[0-9]+]], 4.0
+; CHECK-DAG: v_mov_b32_e32 v[[INS:[0-9]+]], 0x40a00000
+; CHECK: v_movreld_b32_e32 v[[ELT1]], v[[INS]]
+; CHECK: buffer_store_dwordx4 v{{\[}}[[ELT0]]:[[ELT3]]{{\]}}
+define void @insert_w_offset(<4 x float> addrspace(1)* %out, i32 %in) {
 entry:
   %0 = add i32 %in, 1
   %1 = insertelement <4 x float> <float 1.0, float 2.0, float 3.0, float 4.0>, float 5.0, i32 %0
-  %2 = extractelement <4 x float> %1, i32 2
-  store float %2, float addrspace(1)* %out
+  store <4 x float> %1, <4 x float> addrspace(1)* %out
   ret void
 }
 
 ; CHECK-LABEL: {{^}}insert_wo_offset:
 ; CHECK: s_load_dword [[IN:s[0-9]+]]
 ; CHECK: s_mov_b32 m0, [[IN]]
-; CHECK: v_movreld_b32_e32
-define void @insert_wo_offset(float addrspace(1)* %out, i32 %in) {
+; CHECK: v_movreld_b32_e32 v[[ELT0:[0-9]+]]
+; CHECK: buffer_store_dwordx4 v{{\[}}[[ELT0]]:
+define void @insert_wo_offset(<4 x float> addrspace(1)* %out, i32 %in) {
 entry:
   %0 = insertelement <4 x float> <float 1.0, float 2.0, float 3.0, float 4.0>, float 5.0, i32 %in
-  %1 = extractelement <4 x float> %0, i32 2
-  store float %1, float addrspace(1)* %out
+  store <4 x float> %0, <4 x float> addrspace(1)* %out
   ret void
 }