diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
--- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
@@ -1287,6 +1287,20 @@
   return LoadStoreOp;
 }
 
+// Returns true if \p Reg is used for returning a value from this function.
+static bool isReturnValueCCReg(MachineInstr &MI, Register Reg) {
+  if (!MI.isReturn())
+    return false;
+
+  // Check the operand list of the return instruction for the CC regs used
+  // for outgoing values.
+  for (MachineOperand &MO : MI.operands()) {
+    if (MO.isReg() && MO.getReg() == Reg)
+      return true;
+  }
+  return false;
+}
+
 void SIRegisterInfo::buildSpillLoadStore(
     MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, const DebugLoc &DL,
     unsigned LoadStoreOp, int Index, Register ValueReg, bool IsKill,
@@ -1644,6 +1658,32 @@
 
     if (NeedSuperRegImpOperand && (IsFirstSubReg || IsLastSubReg))
       MIB.addReg(ValueReg, RegState::Implicit | SrcDstRegState);
+
+    // The epilog restore of a wwm-scratch register can cause an undesired
+    // optimization during machine-cp after the PrologEpilogInserter, when the
+    // same register was also chosen by ABI lowering to hold the return value,
+    // set through a COPY instruction. As shown below, the epilog reload makes
+    // the earlier COPY appear dead to machine-cp. To fix it, add the register
+    // as a tied implicit operand of such restore instructions; this records a
+    // use for the preceding COPY.
+    //    ...
+    //    v0 is used in a WWM operation; it needs the WWM spill at prolog/epilog.
+    //    $vgpr0 = V_WRITELANE_B32 $sgpr20, 0, $vgpr0
+    //    ...
+    //    Epilog block:
+    //    $vgpr0 = COPY $vgpr1 // outgoing value moved to v0
+    //    ...
+    //    WWM spill restore to preserve the inactive lanes of v0.
+    //    $sgpr4_sgpr5 = S_XOR_SAVEEXEC_B64 -1
+    //    $vgpr0 = BUFFER_LOAD $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0
+    //    $exec = S_MOV_B64 killed $sgpr4_sgpr5
+    //    ...
+    //    SI_RETURN implicit $vgpr0
+    //    ...
+    if (!IsStore && MBB.isReturnBlock() && isReturnValueCCReg(*MI, SubReg)) {
+      MIB.addReg(SubReg, RegState::Implicit);
+      MIB->tieOperands(0, MIB->getNumOperands() - 1);
+    }
   }
 
   if (ScratchOffsetRegDelta != 0) {
diff --git a/llvm/test/CodeGen/AMDGPU/preserve-only-inactive-lane.mir b/llvm/test/CodeGen/AMDGPU/preserve-only-inactive-lane.mir
--- a/llvm/test/CodeGen/AMDGPU/preserve-only-inactive-lane.mir
+++ b/llvm/test/CodeGen/AMDGPU/preserve-only-inactive-lane.mir
@@ -28,7 +28,7 @@
     ; GCN-NEXT: $sgpr35 = V_READLANE_B32 $vgpr0, 0
     ; GCN-NEXT: renamable $vgpr0 = V_MOV_B32_e32 10, implicit $exec
     ; GCN-NEXT: $sgpr4_sgpr5 = S_XOR_SAVEEXEC_B64 -1, implicit-def $exec, implicit-def dead $scc, implicit $exec
-    ; GCN-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, implicit $exec :: (load (s32) from %stack.0, addrspace 5)
+    ; GCN-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, implicit $exec, implicit $vgpr0(tied-def 0) :: (load (s32) from %stack.0, addrspace 5)
     ; GCN-NEXT: $exec = S_MOV_B64 killed $sgpr4_sgpr5
     ; GCN-NEXT: S_SETPC_B64_return killed renamable $sgpr30_sgpr31, implicit $vgpr0
     renamable $vgpr0 = V_WRITELANE_B32 $sgpr35, 0, killed $vgpr0
diff --git a/llvm/test/CodeGen/AMDGPU/tied-op-for-wwm-scratch-reg-spill-restore.mir b/llvm/test/CodeGen/AMDGPU/tied-op-for-wwm-scratch-reg-spill-restore.mir
new file mode 100644
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/tied-op-for-wwm-scratch-reg-spill-restore.mir
@@ -0,0 +1,36 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
+# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx906 -run-pass=prologepilog,machine-cp -verify-machineinstrs %s -o - | FileCheck -check-prefix=GCN %s
+
+# The COPY that moves the return value into VGPR0 should not be removed during machine-cp. The spill restore of the same register
+# that follows is meant to reload only its inactive lanes. Marking the reg itself as a tied-op in the spill reload prevents the undesired optimization.
+
+---
+name: wwm_scratch_reg_spill_reload
+tracksRegLiveness: true
+machineFunctionInfo:
+  wwmReservedRegs: ['$vgpr0']
+  isEntryFunction: false
+  scratchRSrcReg: '$sgpr0_sgpr1_sgpr2_sgpr3'
+  stackPtrOffsetReg: '$sgpr32'
+  frameOffsetReg: '$sgpr33'
+body: |
+  bb.0:
+    liveins: $sgpr20, $vgpr1
+    ; GCN-LABEL: name: wwm_scratch_reg_spill_reload
+    ; GCN: liveins: $sgpr20, $vgpr0, $vgpr1
+    ; GCN-NEXT: {{  $}}
+    ; GCN-NEXT: $sgpr4_sgpr5 = S_XOR_SAVEEXEC_B64 -1, implicit-def $exec, implicit-def dead $scc, implicit $exec
+    ; GCN-NEXT: BUFFER_STORE_DWORD_OFFSET $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, implicit $exec :: (store (s32) into %stack.0, addrspace 5)
+    ; GCN-NEXT: $exec = S_MOV_B64 killed $sgpr4_sgpr5
+    ; GCN-NEXT: $vgpr0 = IMPLICIT_DEF
+    ; GCN-NEXT: $vgpr0 = V_WRITELANE_B32 killed $sgpr20, 0, $vgpr0
+    ; GCN-NEXT: $vgpr0 = COPY killed renamable $vgpr1, implicit $exec
+    ; GCN-NEXT: $sgpr4_sgpr5 = S_XOR_SAVEEXEC_B64 -1, implicit-def $exec, implicit-def dead $scc, implicit $exec
+    ; GCN-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, implicit $exec, implicit $vgpr0(tied-def 0) :: (load (s32) from %stack.0, addrspace 5)
+    ; GCN-NEXT: $exec = S_MOV_B64 killed $sgpr4_sgpr5
+    ; GCN-NEXT: SI_RETURN implicit $vgpr0
+    $vgpr0 = IMPLICIT_DEF
+    $vgpr0 = V_WRITELANE_B32 killed $sgpr20, 0, $vgpr0
+    $vgpr0 = COPY killed renamable $vgpr1, implicit $exec
+    SI_RETURN implicit $vgpr0
+...
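
Note: the tied-operand technique the patch applies can be sketched in
isolation as below. This is a minimal sketch, not part of the patch: the
free-standing helper attachTiedUse() is hypothetical, while
MachineInstrBuilder::addReg() and MachineInstr::tieOperands() are the same
calls used in buildSpillLoadStore above.

#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"

using namespace llvm;

// Illustrative helper, not LLVM API: append Reg as an implicit use of the
// reload instruction and tie it to the reload's def (operand 0). machine-cp
// then sees a read of the register's previous value, so an earlier COPY
// defining the same register cannot be treated as dead and erased.
static void attachTiedUse(MachineInstrBuilder &MIB, Register Reg) {
  MIB.addReg(Reg, RegState::Implicit);            // implicit use of Reg
  MIB->tieOperands(0, MIB->getNumOperands() - 1); // tie that use to the def
}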