diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp --- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp @@ -1644,6 +1644,33 @@ if (NeedSuperRegImpOperand && (IsFirstSubReg || IsLastSubReg)) MIB.addReg(ValueReg, RegState::Implicit | SrcDstRegState); + + // The epilog restore of a wwm-scratch register can cause undesired + // optimization during machine-cp post PrologEpilogInserter if the same + // register was assigned for return value ABI lowering with a COPY + // instruction. As given below, with the epilog reload, the earlier COPY + // appeared to be dead during machine-cp. + // ... + // v0 in WWM operation, needs the WWM spill at prolog/epilog. + // $vgpr0 = V_WRITELANE_B32 $sgpr20, 0, $vgpr0 + // ... + // Epilog block: + // $vgpr0 = COPY $vgpr1 // outgoing value moved to v0 + // ... + // WWM spill restore to preserve the inactive lanes of v0. + // $sgpr4_sgpr5 = S_XOR_SAVEEXEC_B64 -1 + // $vgpr0 = BUFFER_LOAD $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0 + // $exec = S_MOV_B64 killed $sgpr4_sgpr5 + // ... + // SI_RETURN implicit $vgpr0 + // ... + // To fix it, mark the same reg as a tied op for such restore instructions + // so that it marks a usage for the preceding COPY. 
+ if (!IsStore && MI != MBB.end() && MI->isReturn() && + MI->readsRegister(SubReg, this)) { + MIB.addReg(SubReg, RegState::Implicit); + MIB->tieOperands(0, MIB->getNumOperands() - 1); + } } if (ScratchOffsetRegDelta != 0) { diff --git a/llvm/test/CodeGen/AMDGPU/preserve-only-inactive-lane.mir b/llvm/test/CodeGen/AMDGPU/preserve-only-inactive-lane.mir --- a/llvm/test/CodeGen/AMDGPU/preserve-only-inactive-lane.mir +++ b/llvm/test/CodeGen/AMDGPU/preserve-only-inactive-lane.mir @@ -28,7 +28,7 @@ ; GCN-NEXT: $sgpr35 = V_READLANE_B32 $vgpr0, 0 ; GCN-NEXT: renamable $vgpr0 = V_MOV_B32_e32 10, implicit $exec ; GCN-NEXT: $sgpr4_sgpr5 = S_XOR_SAVEEXEC_B64 -1, implicit-def $exec, implicit-def dead $scc, implicit $exec - ; GCN-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, implicit $exec :: (load (s32) from %stack.0, addrspace 5) + ; GCN-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, implicit $exec, implicit $vgpr0(tied-def 0) :: (load (s32) from %stack.0, addrspace 5) ; GCN-NEXT: $exec = S_MOV_B64 killed $sgpr4_sgpr5 ; GCN-NEXT: S_SETPC_B64_return killed renamable $sgpr30_sgpr31, implicit $vgpr0 renamable $vgpr0 = V_WRITELANE_B32 $sgpr35, 0, killed $vgpr0 diff --git a/llvm/test/CodeGen/AMDGPU/tied-op-for-wwm-scratch-reg-spill-restore.mir b/llvm/test/CodeGen/AMDGPU/tied-op-for-wwm-scratch-reg-spill-restore.mir new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/tied-op-for-wwm-scratch-reg-spill-restore.mir @@ -0,0 +1,142 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py +# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx906 -run-pass=prologepilog,machine-cp -verify-machineinstrs %s -o - | FileCheck -check-prefix=GCN %s + +# The COPY that moves the return value to VGPR0 should not be removed during machine-cp. The spill restore of the same register that follows is +# meant to only reload its inactive lanes. 
Marking the reg itself as the tied-op in the spill reload prevents the undesired optimization. + +--- +name: wwm_scratch_reg_spill_reload_of_outgoing_reg +tracksRegLiveness: true +machineFunctionInfo: + wwmReservedRegs: ['$vgpr0'] + isEntryFunction: false + scratchRSrcReg: '$sgpr0_sgpr1_sgpr2_sgpr3' + stackPtrOffsetReg: '$sgpr32' + frameOffsetReg: '$sgpr33' +body: | + bb.0: + liveins: $sgpr20, $vgpr1 + ; GCN-LABEL: name: wwm_scratch_reg_spill_reload_of_outgoing_reg + ; GCN: liveins: $sgpr20, $vgpr0, $vgpr1 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: $sgpr4_sgpr5 = S_XOR_SAVEEXEC_B64 -1, implicit-def $exec, implicit-def dead $scc, implicit $exec + ; GCN-NEXT: BUFFER_STORE_DWORD_OFFSET $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, implicit $exec :: (store (s32) into %stack.0, addrspace 5) + ; GCN-NEXT: $exec = S_MOV_B64 killed $sgpr4_sgpr5 + ; GCN-NEXT: $vgpr0 = IMPLICIT_DEF + ; GCN-NEXT: $vgpr0 = V_WRITELANE_B32 killed $sgpr20, 0, $vgpr0 + ; GCN-NEXT: $vgpr0 = COPY killed renamable $vgpr1, implicit $exec + ; GCN-NEXT: $sgpr4_sgpr5 = S_XOR_SAVEEXEC_B64 -1, implicit-def $exec, implicit-def dead $scc, implicit $exec + ; GCN-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, implicit $exec, implicit $vgpr0(tied-def 0) :: (load (s32) from %stack.0, addrspace 5) + ; GCN-NEXT: $exec = S_MOV_B64 killed $sgpr4_sgpr5 + ; GCN-NEXT: SI_RETURN implicit $vgpr0 + $vgpr0 = IMPLICIT_DEF + $vgpr0 = V_WRITELANE_B32 killed $sgpr20, 0, $vgpr0 + $vgpr0 = COPY killed renamable $vgpr1, implicit $exec + SI_RETURN implicit $vgpr0 +... + +# The reload of vgpr0 requires the tied-op as it is a subreg in the outgoing tuple register vgpr0_vgpr1. +# The vgpr2 doesn't need the tied-op in the reload as it isn't holding any return value. 
+--- +name: wwm_scratch_reg_spill_reload_of_outgoing_tuple_subreg +tracksRegLiveness: true +machineFunctionInfo: + wwmReservedRegs: ['$vgpr0', '$vgpr2'] + isEntryFunction: false + scratchRSrcReg: '$sgpr0_sgpr1_sgpr2_sgpr3' + stackPtrOffsetReg: '$sgpr32' + frameOffsetReg: '$sgpr33' +body: | + bb.0: + liveins: $sgpr20, $sgpr21, $vgpr1 + ; GCN-LABEL: name: wwm_scratch_reg_spill_reload_of_outgoing_tuple_subreg + ; GCN: liveins: $sgpr20, $sgpr21, $vgpr0, $vgpr1, $vgpr2 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: $sgpr4_sgpr5 = S_XOR_SAVEEXEC_B64 -1, implicit-def $exec, implicit-def dead $scc, implicit $exec + ; GCN-NEXT: BUFFER_STORE_DWORD_OFFSET $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, implicit $exec :: (store (s32) into %stack.0, addrspace 5) + ; GCN-NEXT: BUFFER_STORE_DWORD_OFFSET $vgpr2, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 4, 0, 0, implicit $exec :: (store (s32) into %stack.1, addrspace 5) + ; GCN-NEXT: $exec = S_MOV_B64 killed $sgpr4_sgpr5 + ; GCN-NEXT: $vgpr0 = IMPLICIT_DEF + ; GCN-NEXT: $vgpr2 = IMPLICIT_DEF + ; GCN-NEXT: $vgpr0 = V_WRITELANE_B32 killed $sgpr20, 0, $vgpr0 + ; GCN-NEXT: $vgpr2 = V_WRITELANE_B32 killed $sgpr21, 0, $vgpr2 + ; GCN-NEXT: $vgpr0 = COPY $vgpr1, implicit $exec + ; GCN-NEXT: $sgpr4_sgpr5 = S_XOR_SAVEEXEC_B64 -1, implicit-def $exec, implicit-def dead $scc, implicit $exec + ; GCN-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, implicit $exec, implicit $vgpr0(tied-def 0) :: (load (s32) from %stack.0, addrspace 5) + ; GCN-NEXT: $vgpr2 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 4, 0, 0, implicit $exec :: (load (s32) from %stack.1, addrspace 5) + ; GCN-NEXT: $exec = S_MOV_B64 killed $sgpr4_sgpr5 + ; GCN-NEXT: SI_RETURN implicit $vgpr0_vgpr1 + $vgpr0 = IMPLICIT_DEF + $vgpr2 = IMPLICIT_DEF + $vgpr0 = V_WRITELANE_B32 killed $sgpr20, 0, $vgpr0 + $vgpr2 = V_WRITELANE_B32 killed $sgpr21, 0, $vgpr2 + $vgpr0 = COPY $vgpr1, implicit $exec + SI_RETURN implicit $vgpr0_vgpr1 +... 
+ +# Tied op not required in the spill reload of vgpr2. + +--- +name: wwm_scratch_reg_spill_reload_different_outgoing_reg +tracksRegLiveness: true +machineFunctionInfo: + wwmReservedRegs: ['$vgpr2'] + isEntryFunction: false + scratchRSrcReg: '$sgpr0_sgpr1_sgpr2_sgpr3' + stackPtrOffsetReg: '$sgpr32' + frameOffsetReg: '$sgpr33' +body: | + bb.0: + liveins: $sgpr20, $vgpr1 + ; GCN-LABEL: name: wwm_scratch_reg_spill_reload_different_outgoing_reg + ; GCN: liveins: $sgpr20, $vgpr1, $vgpr2 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: $sgpr4_sgpr5 = S_XOR_SAVEEXEC_B64 -1, implicit-def $exec, implicit-def dead $scc, implicit $exec + ; GCN-NEXT: BUFFER_STORE_DWORD_OFFSET $vgpr2, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, implicit $exec :: (store (s32) into %stack.0, addrspace 5) + ; GCN-NEXT: $exec = S_MOV_B64 killed $sgpr4_sgpr5 + ; GCN-NEXT: $vgpr2 = IMPLICIT_DEF + ; GCN-NEXT: $vgpr2 = V_WRITELANE_B32 killed $sgpr20, 0, $vgpr2 + ; GCN-NEXT: $vgpr0 = COPY $vgpr1, implicit $exec + ; GCN-NEXT: $sgpr4_sgpr5 = S_XOR_SAVEEXEC_B64 -1, implicit-def $exec, implicit-def dead $scc, implicit $exec + ; GCN-NEXT: $vgpr2 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, implicit $exec :: (load (s32) from %stack.0, addrspace 5) + ; GCN-NEXT: $exec = S_MOV_B64 killed $sgpr4_sgpr5 + ; GCN-NEXT: SI_RETURN implicit $vgpr0_vgpr1 + $vgpr2 = IMPLICIT_DEF + $vgpr2 = V_WRITELANE_B32 killed $sgpr20, 0, $vgpr2 + $vgpr0 = COPY $vgpr1, implicit $exec + SI_RETURN implicit $vgpr0_vgpr1 +... + +# Tied op not required in the spill reload of vgpr40 which is in the CSR range. 
+--- +name: wwm_csr_spill_reload +tracksRegLiveness: true +machineFunctionInfo: + wwmReservedRegs: ['$vgpr40'] + isEntryFunction: false + scratchRSrcReg: '$sgpr0_sgpr1_sgpr2_sgpr3' + stackPtrOffsetReg: '$sgpr32' + frameOffsetReg: '$sgpr33' +body: | + bb.0: + liveins: $sgpr20, $vgpr1 + ; GCN-LABEL: name: wwm_csr_spill_reload + ; GCN: liveins: $sgpr20, $vgpr1, $vgpr40 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: $sgpr4_sgpr5 = S_OR_SAVEEXEC_B64 -1, implicit-def $exec, implicit-def dead $scc, implicit $exec + ; GCN-NEXT: BUFFER_STORE_DWORD_OFFSET $vgpr40, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, implicit $exec :: (store (s32) into %stack.0, addrspace 5) + ; GCN-NEXT: $exec = S_MOV_B64 killed $sgpr4_sgpr5 + ; GCN-NEXT: $vgpr40 = IMPLICIT_DEF + ; GCN-NEXT: $vgpr40 = V_WRITELANE_B32 killed $sgpr20, 0, $vgpr40 + ; GCN-NEXT: $sgpr20 = V_READLANE_B32 $vgpr40, 0, implicit $exec + ; GCN-NEXT: $vgpr0 = COPY killed $vgpr1, implicit $exec + ; GCN-NEXT: $sgpr4_sgpr5 = S_OR_SAVEEXEC_B64 -1, implicit-def $exec, implicit-def dead $scc, implicit $exec + ; GCN-NEXT: $vgpr40 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, implicit $exec :: (load (s32) from %stack.0, addrspace 5) + ; GCN-NEXT: $exec = S_MOV_B64 killed $sgpr4_sgpr5 + ; GCN-NEXT: SI_RETURN implicit $vgpr0 + $vgpr40 = IMPLICIT_DEF + $vgpr40 = V_WRITELANE_B32 killed $sgpr20, 0, $vgpr40 + $sgpr20 = V_READLANE_B32 $vgpr40, 0, implicit $exec + $vgpr0 = COPY killed $vgpr1, implicit $exec + SI_RETURN implicit $vgpr0 +...