diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.h b/llvm/lib/Target/AMDGPU/SIInstrInfo.h
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.h
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.h
@@ -952,12 +952,12 @@
 
   void insertScratchExecCopy(MachineFunction &MF, MachineBasicBlock &MBB,
                              MachineBasicBlock::iterator MBBI,
-                             const DebugLoc &DL, Register Reg,
-                             bool IsSCCLive) const;
+                             const DebugLoc &DL, Register Reg, bool IsSCCLive,
+                             SlotIndexes *Indexes = nullptr) const;
 
   void restoreExec(MachineFunction &MF, MachineBasicBlock &MBB,
                    MachineBasicBlock::iterator MBBI, const DebugLoc &DL,
-                   Register Reg) const;
+                   Register Reg, SlotIndexes *Indexes = nullptr) const;
 
   /// Return the correct register class for \p OpNo.  For target-specific
   /// instructions, this will return the register class that has been defined
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -4952,7 +4952,8 @@
                                         MachineBasicBlock &MBB,
                                         MachineBasicBlock::iterator MBBI,
                                         const DebugLoc &DL, Register Reg,
-                                        bool IsSCCLive) const {
+                                        bool IsSCCLive,
+                                        SlotIndexes *Indexes) const {
   const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
   const SIInstrInfo *TII = ST.getInstrInfo();
   bool IsWave32 = ST.isWave32();
@@ -4962,25 +4963,36 @@
     // the single instruction S_OR_SAVEEXEC that clobbers SCC.
     unsigned MovOpc = IsWave32 ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
     MCRegister Exec = IsWave32 ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
-    BuildMI(MBB, MBBI, DL, TII->get(MovOpc), Reg).addReg(Exec, RegState::Kill);
-    BuildMI(MBB, MBBI, DL, TII->get(MovOpc), Exec).addImm(-1);
+    auto StoreExecMI = BuildMI(MBB, MBBI, DL, TII->get(MovOpc), Reg)
+                           .addReg(Exec, RegState::Kill);
+    auto FlipExecMI = BuildMI(MBB, MBBI, DL, TII->get(MovOpc), Exec).addImm(-1);
+    if (Indexes) {
+      Indexes->insertMachineInstrInMaps(*StoreExecMI);
+      Indexes->insertMachineInstrInMaps(*FlipExecMI);
+    }
   } else {
     const unsigned OrSaveExec =
         IsWave32 ? AMDGPU::S_OR_SAVEEXEC_B32 : AMDGPU::S_OR_SAVEEXEC_B64;
     auto SaveExec =
         BuildMI(MBB, MBBI, DL, TII->get(OrSaveExec), Reg).addImm(-1);
     SaveExec->getOperand(3).setIsDead(); // Mark SCC as dead.
+    if (Indexes)
+      Indexes->insertMachineInstrInMaps(*SaveExec);
   }
 }
 
 void SIInstrInfo::restoreExec(MachineFunction &MF, MachineBasicBlock &MBB,
                               MachineBasicBlock::iterator MBBI,
-                              const DebugLoc &DL, Register Reg) const {
+                              const DebugLoc &DL, Register Reg,
+                              SlotIndexes *Indexes) const {
   const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
   const SIInstrInfo *TII = ST.getInstrInfo();
   unsigned ExecMov = ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
   MCRegister Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
-  BuildMI(MBB, MBBI, DL, TII->get(ExecMov), Exec).addReg(Reg, RegState::Kill);
+  auto ExecRestoreMI = BuildMI(MBB, MBBI, DL, TII->get(ExecMov), Exec)
+                           .addReg(Reg, RegState::Kill);
+  if (Indexes)
+    Indexes->insertMachineInstrInMaps(*ExecRestoreMI);
 }
 
 static const TargetRegisterClass *
diff --git a/llvm/lib/Target/AMDGPU/SISimplifyPredicatedCopies.cpp b/llvm/lib/Target/AMDGPU/SISimplifyPredicatedCopies.cpp
--- a/llvm/lib/Target/AMDGPU/SISimplifyPredicatedCopies.cpp
+++ b/llvm/lib/Target/AMDGPU/SISimplifyPredicatedCopies.cpp
@@ -21,6 +21,7 @@
 #include "SIMachineFunctionInfo.h"
 #include "llvm/CodeGen/LiveIntervals.h"
 #include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/VirtRegMap.h"
 #include "llvm/InitializePasses.h"
 
 using namespace llvm;
@@ -49,6 +50,13 @@
   }
 
 private:
+  bool isWWMCopy(const MachineInstr &MI);
+  bool isSCCLiveAtMI(const MachineInstr &MI);
+  void addToWWMSpills(MachineFunction &MF, Register Reg);
+
+  LiveIntervals *LIS;
+  SlotIndexes *Indexes;
+  VirtRegMap *VRM;
   const SIRegisterInfo *TRI;
   const MachineRegisterInfo *MRI;
   SIMachineFunctionInfo *MFI;
@@ -56,18 +64,75 @@
 
 } // End anonymous namespace.
 
-INITIALIZE_PASS(SISimplifyPredicatedCopies, DEBUG_TYPE,
-                "SI Simplify Predicated Copies", false, false)
+INITIALIZE_PASS_BEGIN(SISimplifyPredicatedCopies, DEBUG_TYPE,
+                      "SI Simplify Predicated Copies", false, false)
+INITIALIZE_PASS_DEPENDENCY(LiveIntervals)
+INITIALIZE_PASS_DEPENDENCY(VirtRegMap)
+INITIALIZE_PASS_END(SISimplifyPredicatedCopies, DEBUG_TYPE,
+                    "SI Simplify Predicated Copies", false, false)
 
 char SISimplifyPredicatedCopies::ID = 0;
 
 char &llvm::SISimplifyPredicatedCopiesID = SISimplifyPredicatedCopies::ID;
 
+// Returns true if \p MI is a full copy whose source register carries the
+// WWM_REG flag, either directly or via a chain of uniquely-defined
+// intermediate copies that is walked recursively.
+bool SISimplifyPredicatedCopies::isWWMCopy(const MachineInstr &MI) {
+  // Subregister (partial) copies are never treated as whole-wave copies.
+  if (!MI.isFullCopy())
+    return false;
+
+  Register SrcReg = MI.getOperand(1).getReg();
+
+  if (MFI->checkFlag(SrcReg, AMDGPU::VirtRegFlag::WWM_REG))
+    return true;
+
+  if (SrcReg.isPhysical())
+    return false;
+
+  // Otherwise keep looking through the unique defining copy of the source.
+  const MachineInstr *DefMI = MRI->getUniqueVRegDef(SrcReg);
+  if (!DefMI || !DefMI->isCopy())
+    return false;
+
+  return isWWMCopy(*DefMI);
+}
+
+bool SISimplifyPredicatedCopies::isSCCLiveAtMI(const MachineInstr &MI) {
+  // Without LiveIntervals there is no liveness information to query, so
+  // conservatively report SCC as live.
+  if (!LIS)
+    return true;
+
+  LiveRange &LR =
+      LIS->getRegUnit(*MCRegUnitIterator(MCRegister::from(AMDGPU::SCC), TRI));
+  SlotIndex Idx = LIS->getInstructionIndex(MI);
+  return LR.liveAt(Idx);
+}
+
+// Records the physical register assigned to virtual register \p Reg as a WWM
+// spill so that all of its lanes are preserved at function prolog/epilog.
+void SISimplifyPredicatedCopies::addToWWMSpills(MachineFunction &MF,
+                                                Register Reg) {
+  if (!VRM || Reg.isPhysical())
+    return;
+
+  Register PhysReg = VRM->getPhys(Reg);
+  assert(PhysReg != VirtRegMap::NO_PHYS_REG &&
+         "should have allocated a physical register");
+
+  MFI->allocateWWMSpill(MF, PhysReg);
+}
+
 bool SISimplifyPredicatedCopies::runOnMachineFunction(MachineFunction &MF) {
   const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
   const SIInstrInfo *TII = ST.getInstrInfo();
 
   MFI = MF.getInfo<SIMachineFunctionInfo>();
+  LIS = getAnalysisIfAvailable<LiveIntervals>();
+  Indexes = getAnalysisIfAvailable<SlotIndexes>();
+  VRM = getAnalysisIfAvailable<VirtRegMap>();
   TRI = ST.getRegisterInfo();
   MRI = &MF.getRegInfo();
   bool Changed = false;
@@ -75,13 +140,19 @@
   for (MachineBasicBlock &MBB : MF) {
     for (MachineInstr &MI : MBB) {
       if (MI.getOpcode() == AMDGPU::PRED_COPY) {
-
-        // Below asserts still fails cause isVgprCopy call isCopy() which is
-        // false for PRED_COPY. Figure out a way to use isCopyInstr()
-        // everywhere. assert(TII->isVGPRCopy(MI));
-
-        // Whole wave register copy logic goes here //
-
+        assert(TII->isVGPRCopy(MI));
+        if (MI.getOperand(0).getReg().isVirtual() && isWWMCopy(MI)) {
+          // For WWM vector copies, save exec and enable all lanes before the
+          // copy, then restore exec right after it.
+          DebugLoc DL = MI.getDebugLoc();
+          MachineBasicBlock::iterator InsertPt = MI.getIterator();
+          Register RegForExecCopy = MFI->getSGPRForEXECCopy();
+          TII->insertScratchExecCopy(MF, MBB, InsertPt, DL, RegForExecCopy,
+                                     isSCCLiveAtMI(MI), Indexes);
+          TII->restoreExec(MF, MBB, ++InsertPt, DL, RegForExecCopy, Indexes);
+          addToWWMSpills(MF, MI.getOperand(0).getReg());
+          LLVM_DEBUG(dbgs() << "WWM copy manipulation for " << MI);
+        }
         // Lower PRED_COPY to COPY
         LLVM_DEBUG(dbgs() << MI << " to use COPY opcode");
         MI.setDesc(TII->get(AMDGPU::COPY));
diff --git a/llvm/test/CodeGen/AMDGPU/skip-subreg-copy-from-iswwmcopy-check.mir b/llvm/test/CodeGen/AMDGPU/skip-subreg-copy-from-iswwmcopy-check.mir
new file mode 100644
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/skip-subreg-copy-from-iswwmcopy-check.mir
@@ -0,0 +1,20 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
+# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -run-pass=si-simplify-predicated-copies -verify-machineinstrs %s -o - | FileCheck -check-prefix=GCN %s
+
+# Without the full-copy check, isWWMCopy() recursed forever on this input:
+# getUniqueVRegDef of the SrcReg returns the copy itself when it is a partial copy.
+# WWM copies are always full copies, so subreg copies are skipped by the check.
+
+---
+name:            subreg_copy
+tracksRegLiveness: true
+machineFunctionInfo:
+  isEntryFunction: false
+body:             |
+  bb.0:
+    ; GCN-LABEL: name: subreg_copy
+    ; GCN: dead undef %0.sub3:vreg_128_align2 = COPY undef %0.sub1
+    ; GCN-NEXT: SI_RETURN
+    dead undef %0.sub3:vreg_128_align2 = PRED_COPY undef %0.sub1
+    SI_RETURN
+...