diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.h b/llvm/lib/Target/AMDGPU/SIInstrInfo.h
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.h
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.h
@@ -918,12 +918,12 @@
 
   void insertScratchExecCopy(MachineFunction &MF, MachineBasicBlock &MBB,
                              MachineBasicBlock::iterator MBBI,
-                             const DebugLoc &DL, Register Reg,
-                             bool IsSCCLive) const;
+                             const DebugLoc &DL, Register Reg, bool IsSCCLive,
+                             SlotIndexes *Indexes = nullptr) const;
 
   void restoreExec(MachineFunction &MF, MachineBasicBlock &MBB,
                    MachineBasicBlock::iterator MBBI, const DebugLoc &DL,
-                   Register Reg) const;
+                   Register Reg, SlotIndexes *Indexes = nullptr) const;
 
   /// Return the correct register class for \p OpNo.  For target-specific
   /// instructions, this will return the register class that has been defined
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -4928,7 +4928,8 @@
                                         MachineBasicBlock &MBB,
                                         MachineBasicBlock::iterator MBBI,
                                         const DebugLoc &DL, Register Reg,
-                                        bool IsSCCLive) const {
+                                        bool IsSCCLive,
+                                        SlotIndexes *Indexes) const {
   const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
   const SIInstrInfo *TII = ST.getInstrInfo();
   bool IsWave32 = ST.isWave32();
@@ -4938,25 +4939,36 @@
     // the single instruction S_OR_SAVEEXEC that clobbers SCC.
     unsigned MovOpc = IsWave32 ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
     MCRegister Exec = IsWave32 ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
-    BuildMI(MBB, MBBI, DL, TII->get(MovOpc), Reg).addReg(Exec, RegState::Kill);
-    BuildMI(MBB, MBBI, DL, TII->get(MovOpc), Exec).addImm(-1);
+    auto StoreExecMI = BuildMI(MBB, MBBI, DL, TII->get(MovOpc), Reg)
+                           .addReg(Exec, RegState::Kill);
+    auto FlipExecMI = BuildMI(MBB, MBBI, DL, TII->get(MovOpc), Exec).addImm(-1);
+    if (Indexes) {
+      Indexes->insertMachineInstrInMaps(*StoreExecMI);
+      Indexes->insertMachineInstrInMaps(*FlipExecMI);
+    }
   } else {
     const unsigned OrSaveExec =
         IsWave32 ? AMDGPU::S_OR_SAVEEXEC_B32 : AMDGPU::S_OR_SAVEEXEC_B64;
     auto SaveExec =
         BuildMI(MBB, MBBI, DL, TII->get(OrSaveExec), Reg).addImm(-1);
     SaveExec->getOperand(3).setIsDead(); // Mark SCC as dead.
+    if (Indexes)
+      Indexes->insertMachineInstrInMaps(*SaveExec);
   }
 }
 
 void SIInstrInfo::restoreExec(MachineFunction &MF, MachineBasicBlock &MBB,
                               MachineBasicBlock::iterator MBBI,
-                              const DebugLoc &DL, Register Reg) const {
+                              const DebugLoc &DL, Register Reg,
+                              SlotIndexes *Indexes) const {
   const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
   const SIInstrInfo *TII = ST.getInstrInfo();
   unsigned ExecMov = ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
   MCRegister Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
-  BuildMI(MBB, MBBI, DL, TII->get(ExecMov), Exec).addReg(Reg, RegState::Kill);
+  auto ExecRestoreMI = BuildMI(MBB, MBBI, DL, TII->get(ExecMov), Exec)
+                           .addReg(Reg, RegState::Kill);
+  if (Indexes)
+    Indexes->insertMachineInstrInMaps(*ExecRestoreMI);
 }
 
 static const TargetRegisterClass *
diff --git a/llvm/lib/Target/AMDGPU/SISimplifyPredicatedCopies.cpp b/llvm/lib/Target/AMDGPU/SISimplifyPredicatedCopies.cpp
--- a/llvm/lib/Target/AMDGPU/SISimplifyPredicatedCopies.cpp
+++ b/llvm/lib/Target/AMDGPU/SISimplifyPredicatedCopies.cpp
@@ -14,7 +14,10 @@
 /// scalar registers don't carry any such dependency and hence the regular COPY
 /// opcode can be used. AMDGPU by default uses PRED_COPY opcode right from the
 /// instruction selection and this pass would simplify the COPY opcode and the
-/// implicit operand field as mentioned above.
+/// implicit operand field as mentioned above. This pass also implements the
+/// EXEC mask manipulation around whole-wave vector register copies: it sets
+/// all bits of EXEC to one before the copy and restores the saved mask
+/// immediately afterwards.
 //
 //===----------------------------------------------------------------------===//
 
@@ -52,6 +55,11 @@
   }
 
 private:
+  bool isWWMCopy(const MachineInstr &MI);
+  bool isSCCLiveAtMI(const MachineInstr &MI);
+
+  LiveIntervals *LIS;
+  SlotIndexes *Indexes;
   const SIRegisterInfo *TRI;
   const MachineRegisterInfo *MRI;
   SIMachineFunctionInfo *MFI;
@@ -61,6 +69,7 @@
 
 INITIALIZE_PASS_BEGIN(SISimplifyPredicatedCopies, DEBUG_TYPE,
                       "SI Simplify Predicated Copies", false, false)
+INITIALIZE_PASS_DEPENDENCY(LiveIntervals)
 INITIALIZE_PASS_END(SISimplifyPredicatedCopies, DEBUG_TYPE,
                     "SI Simplify Predicated Copies", false, false)
 
@@ -68,11 +77,45 @@
 
 char &llvm::SISimplifyPredicatedCopiesID = SISimplifyPredicatedCopies::ID;
 
+// Returns true if operand 1 of \p MI is flagged WWM_REG, or if it is a
+// virtual register whose unique def is a copy for which this check holds
+// recursively; returns false otherwise.
+bool SISimplifyPredicatedCopies::isWWMCopy(const MachineInstr &MI) {
+  Register SrcReg = MI.getOperand(1).getReg();
+
+  if (MFI->checkFlag(SrcReg, AMDGPU::VirtRegFlag::WWM_REG))
+    return true;
+
+  if (SrcReg.isPhysical())
+    return false;
+
+  // Follow the unique def; give up unless it exists and is itself a copy.
+  const MachineInstr *DefMI = MRI->getUniqueVRegDef(SrcReg);
+  if (!DefMI || !DefMI->isCopy())
+    return false;
+
+  return isWWMCopy(*DefMI);
+}
+
+bool SISimplifyPredicatedCopies::isSCCLiveAtMI(const MachineInstr &MI) {
+  // Without LiveIntervals there is no liveness information to query, so
+  // conservatively report SCC as live.
+  if (!LIS)
+    return true;
+
+  LiveRange &LR =
+      LIS->getRegUnit(*MCRegUnitIterator(MCRegister::from(AMDGPU::SCC), TRI));
+  SlotIndex Idx = LIS->getInstructionIndex(MI);
+  return LR.liveAt(Idx);
+}
+
 bool SISimplifyPredicatedCopies::runOnMachineFunction(MachineFunction &MF) {
   const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
   const SIInstrInfo *TII = ST.getInstrInfo();
 
   MFI = MF.getInfo<SIMachineFunctionInfo>();
+  LIS = getAnalysisIfAvailable<LiveIntervals>();
+  Indexes = getAnalysisIfAvailable<SlotIndexes>();
   TRI = ST.getRegisterInfo();
   MRI = &MF.getRegInfo();
   bool Changed = false;
@@ -93,6 +136,20 @@
             Changed = true;
           }
         } else {
+          if (TII->isVGPRCopy(MI) &&
+              !TRI->isSGPRReg(*MRI, MI.getOperand(1).getReg()) &&
+              MI.getOperand(0).getReg().isVirtual() && isWWMCopy(MI)) {
+            // For WWM vector copies, manipulate the exec mask around the copy
+            // instruction.
+            DebugLoc DL = MI.getDebugLoc();
+            MachineBasicBlock::iterator InsertPt = MI.getIterator();
+            Register RegForExecCopy = MFI->getSGPRForEXECCopy();
+            TII->insertScratchExecCopy(MF, MBB, InsertPt, DL, RegForExecCopy,
+                                       isSCCLiveAtMI(MI), Indexes);
+            TII->restoreExec(MF, MBB, ++InsertPt, DL, RegForExecCopy, Indexes);
+            LLVM_DEBUG(dbgs() << "WWM copy manipulation for " << MI);
+          }
+
           // For vector registers, add implicit exec use.
           if (!MI.readsRegister(AMDGPU::EXEC, TRI)) {
             MI.addOperand(MF,