Index: llvm/lib/CodeGen/ExpandPostRAPseudos.cpp
===================================================================
--- llvm/lib/CodeGen/ExpandPostRAPseudos.cpp
+++ llvm/lib/CodeGen/ExpandPostRAPseudos.cpp
@@ -68,9 +68,16 @@
   MachineBasicBlock::iterator CopyMI = MI;
   --CopyMI;
 
-  for (const MachineOperand &MO : MI->implicit_operands())
-    if (MO.isReg())
-      CopyMI->addOperand(MO);
+  Register DstReg = MI->getOperand(0).getReg();
+  for (const MachineOperand &MO : MI->implicit_operands()) {
+    CopyMI->addOperand(MO);
+
+    // Be conservative about preserving kills when subregister defs are
+    // involved. If there was an implicit kill of a super-register overlapping
+    // the copy result, we would kill subregisters defined by previous copies.
+    if (MO.isKill() && TRI->regsOverlap(DstReg, MO.getReg()))
+      CopyMI->getOperand(CopyMI->getNumOperands() - 1).setIsKill(false);
+  }
 }
 
 bool ExpandPostRA::LowerSubregToReg(MachineInstr *MI) {
Index: llvm/test/CodeGen/AMDGPU/copy-phys-reg-implicit-operand-kills-subregs.mir
===================================================================
--- /dev/null
+++ llvm/test/CodeGen/AMDGPU/copy-phys-reg-implicit-operand-kills-subregs.mir
@@ -0,0 +1,24 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
+# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1031 -verify-machineinstrs -run-pass=postrapseudos -o - %s | FileCheck %s
+
+# The copy has an implicit killed use of a superregister which overlaps
+# the register it defines. We cannot preserve the kill on the tuple use
+# when copying implicit operands to the last inserted v_mov_b32, since
+# it would kill the subregister defined earlier in the expansion.
+
+---
+name: copy_has_implicit_kill_superreg
+tracksRegLiveness: true
+body:             |
+  bb.0:
+
+    ; CHECK-LABEL: name: copy_has_implicit_kill_superreg
+    ; CHECK: renamable $vgpr7_vgpr8_vgpr9_vgpr10 = IMPLICIT_DEF
+    ; CHECK-NEXT: $vgpr7 = V_MOV_B32_e32 $vgpr10, implicit $exec, implicit-def $vgpr7_vgpr8, implicit $vgpr10_vgpr11
+    ; CHECK-NEXT: $vgpr8 = V_MOV_B32_e32 $vgpr11, implicit $exec, implicit killed $vgpr10_vgpr11, implicit $vgpr7_vgpr8_vgpr9_vgpr10
+    ; CHECK-NEXT: S_ENDPGM 0, implicit $vgpr7
+    renamable $vgpr7_vgpr8_vgpr9_vgpr10 = IMPLICIT_DEF
+    renamable $vgpr7_vgpr8 = COPY killed renamable $vgpr10_vgpr11, implicit killed $vgpr7_vgpr8_vgpr9_vgpr10
+    S_ENDPGM 0, implicit $vgpr7
+
+...
Index: llvm/test/CodeGen/AMDGPU/overlapping-tuple-copy-implicit-op-failure.ll
===================================================================
--- /dev/null
+++ llvm/test/CodeGen/AMDGPU/overlapping-tuple-copy-implicit-op-failure.ll
@@ -0,0 +1,102 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -global-isel -O0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1031 -verify-machineinstrs < %s | FileCheck %s
+
+; Testcase which happened to trigger a liveness verifier error
+define amdgpu_kernel void @test_long_add4(<4 x i64> %arg) #0 {
+; CHECK-LABEL: test_long_add4:
+; CHECK:       ; %bb.0: ; %entry
+; CHECK-NEXT:    s_mov_b64 s[4:5], 0
+; CHECK-NEXT:    v_mov_b32_e32 v0, s4
+; CHECK-NEXT:    v_mov_b32_e32 v1, s5
+; CHECK-NEXT:    global_load_dwordx4 v[7:10], v[0:1], off
+; CHECK-NEXT:    v_mov_b32_e32 v0, s4
+; CHECK-NEXT:    v_mov_b32_e32 v1, s5
+; CHECK-NEXT:    global_load_dwordx4 v[0:3], v[0:1], off offset:16
+; CHECK-NEXT:    ; kill: def $vgpr7_vgpr8_vgpr9_vgpr10 killed $vgpr7_vgpr8_vgpr9_vgpr10 def $vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14 killed $exec
+; CHECK-NEXT:    s_waitcnt vmcnt(0)
+; CHECK-NEXT:    v_mov_b32_e32 v14, v3
+; CHECK-NEXT:    v_mov_b32_e32 v13, v2
+; CHECK-NEXT:    v_mov_b32_e32 v12, v1
+; CHECK-NEXT:    v_mov_b32_e32 v11, v0
+; CHECK-NEXT:    v_mov_b32_e32 v0, s4
+; CHECK-NEXT:    v_mov_b32_e32 v1, s5
+; CHECK-NEXT:    global_load_dwordx4 v[18:21], v[0:1], off
+; CHECK-NEXT:    v_mov_b32_e32 v0, s4
+; CHECK-NEXT:    v_mov_b32_e32 v1, s5
+; CHECK-NEXT:    global_load_dwordx4 v[0:3], v[0:1], off offset:16
+; CHECK-NEXT:    ; kill: def $vgpr18_vgpr19_vgpr20_vgpr21 killed $vgpr18_vgpr19_vgpr20_vgpr21 def $vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25 killed $exec
+; CHECK-NEXT:    s_waitcnt vmcnt(0)
+; CHECK-NEXT:    v_mov_b32_e32 v25, v3
+; CHECK-NEXT:    v_mov_b32_e32 v24, v2
+; CHECK-NEXT:    v_mov_b32_e32 v23, v1
+; CHECK-NEXT:    v_mov_b32_e32 v22, v0
+; CHECK-NEXT:    v_mov_b32_e32 v2, v7
+; CHECK-NEXT:    v_mov_b32_e32 v3, v8
+; CHECK-NEXT:    v_mov_b32_e32 v0, v9
+; CHECK-NEXT:    v_mov_b32_e32 v1, v10
+; CHECK-NEXT:    v_mov_b32_e32 v5, v11
+; CHECK-NEXT:    v_mov_b32_e32 v6, v12
+; CHECK-NEXT:    v_mov_b32_e32 v12, v13
+; CHECK-NEXT:    v_mov_b32_e32 v13, v14
+; CHECK-NEXT:    v_mov_b32_e32 v8, v18
+; CHECK-NEXT:    v_mov_b32_e32 v9, v19
+; CHECK-NEXT:    v_mov_b32_e32 v16, v20
+; CHECK-NEXT:    v_mov_b32_e32 v17, v21
+; CHECK-NEXT:    v_mov_b32_e32 v14, v22
+; CHECK-NEXT:    v_mov_b32_e32 v15, v23
+; CHECK-NEXT:    v_mov_b32_e32 v10, v24
+; CHECK-NEXT:    v_mov_b32_e32 v11, v25
+; CHECK-NEXT:    v_mov_b32_e32 v4, v2
+; CHECK-NEXT:    v_mov_b32_e32 v2, v3
+; CHECK-NEXT:    v_mov_b32_e32 v7, v8
+; CHECK-NEXT:    v_mov_b32_e32 v3, v9
+; CHECK-NEXT:    v_add_co_u32 v7, s6, v4, v7
+; CHECK-NEXT:    v_add_co_ci_u32_e64 v2, s6, v2, v3, s6
+; CHECK-NEXT:    ; kill: def $vgpr7 killed $vgpr7 def $vgpr7_vgpr8 killed $exec
+; CHECK-NEXT:    v_mov_b32_e32 v8, v2
+; CHECK-NEXT:    v_mov_b32_e32 v2, v0
+; CHECK-NEXT:    v_mov_b32_e32 v0, v1
+; CHECK-NEXT:    v_mov_b32_e32 v3, v16
+; CHECK-NEXT:    v_mov_b32_e32 v1, v17
+; CHECK-NEXT:    v_add_co_u32 v3, s6, v2, v3
+; CHECK-NEXT:    v_add_co_ci_u32_e64 v0, s6, v0, v1, s6
+; CHECK-NEXT:    ; kill: def $vgpr3 killed $vgpr3 def $vgpr3_vgpr4 killed $exec
+; CHECK-NEXT:    v_mov_b32_e32 v4, v0
+; CHECK-NEXT:    v_mov_b32_e32 v1, v5
+; CHECK-NEXT:    v_mov_b32_e32 v0, v6
+; CHECK-NEXT:    v_mov_b32_e32 v5, v14
+; CHECK-NEXT:    v_mov_b32_e32 v2, v15
+; CHECK-NEXT:    v_add_co_u32 v1, s6, v1, v5
+; CHECK-NEXT:    v_add_co_ci_u32_e64 v0, s6, v0, v2, s6
+; CHECK-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; CHECK-NEXT:    v_mov_b32_e32 v2, v0
+; CHECK-NEXT:    v_mov_b32_e32 v5, v12
+; CHECK-NEXT:    v_mov_b32_e32 v0, v13
+; CHECK-NEXT:    v_mov_b32_e32 v9, v10
+; CHECK-NEXT:    v_mov_b32_e32 v6, v11
+; CHECK-NEXT:    v_add_co_u32 v5, s6, v5, v9
+; CHECK-NEXT:    v_add_co_ci_u32_e64 v0, s6, v0, v6, s6
+; CHECK-NEXT:    ; kill: def $vgpr5 killed $vgpr5 def $vgpr5_vgpr6 killed $exec
+; CHECK-NEXT:    v_mov_b32_e32 v6, v0
+; CHECK-NEXT:    ; kill: def $vgpr7_vgpr8 killed $vgpr7_vgpr8 def $vgpr7_vgpr8_vgpr9_vgpr10 killed $exec
+; CHECK-NEXT:    v_mov_b32_e32 v10, v4
+; CHECK-NEXT:    v_mov_b32_e32 v9, v3
+; CHECK-NEXT:    ; kill: def $vgpr1_vgpr2 killed $vgpr1_vgpr2 def $vgpr1_vgpr2_vgpr3_vgpr4 killed $exec
+; CHECK-NEXT:    v_mov_b32_e32 v3, v5
+; CHECK-NEXT:    v_mov_b32_e32 v4, v6
+; CHECK-NEXT:    v_mov_b32_e32 v6, s5
+; CHECK-NEXT:    v_mov_b32_e32 v5, s4
+; CHECK-NEXT:    global_store_dwordx4 v[5:6], v[7:10], off
+; CHECK-NEXT:    s_mov_b64 s[4:5], 16
+; CHECK-NEXT:    v_mov_b32_e32 v0, 0
+; CHECK-NEXT:    global_store_dwordx4 v0, v[1:4], s[4:5]
+; CHECK-NEXT:    s_endpgm
+entry:
+  %load0 = load <4 x i64>, <4 x i64> addrspace(1)* null, align 32
+  %load1 = load <4 x i64>, <4 x i64> addrspace(1)* null, align 32
+  %add = add <4 x i64> %load0, %load1
+  store <4 x i64> %add, <4 x i64> addrspace(1)* null, align 32
+  ret void
+}
+
+attributes #0 = { noinline optnone }
Index: llvm/test/CodeGen/X86/pr28560.ll
===================================================================
--- llvm/test/CodeGen/X86/pr28560.ll
+++ llvm/test/CodeGen/X86/pr28560.ll
@@ -1,6 +1,6 @@
 ; RUN: llc -mtriple=i686-pc-linux -print-after=postrapseudos < %s 2>&1 | FileCheck %s
 
-; CHECK: MOV8rr ${{[a-d]}}l, implicit killed $e[[R:[a-d]]]x, implicit-def $e[[R]]x
+; CHECK: MOV8rr ${{[a-d]}}l, implicit $e[[R:[a-d]]]x, implicit-def $e[[R]]x
 define i32 @foo(i32 %i, i32 %k, i8* %p) {
   %f = icmp ne i32 %i, %k
   %s = zext i1 %f to i8