Index: lib/Target/AMDGPU/SIFrameLowering.cpp
===================================================================
--- lib/Target/AMDGPU/SIFrameLowering.cpp
+++ lib/Target/AMDGPU/SIFrameLowering.cpp
@@ -523,22 +523,20 @@
 // but we would then have to make sure that we were in fact saving at least one
 // callee-save register in the prologue, which is additional complexity that
 // doesn't seem worth the benefit.
-static unsigned findScratchNonCalleeSaveRegister(MachineBasicBlock &MBB) {
-  MachineFunction *MF = MBB.getParent();
-
-  const GCNSubtarget &Subtarget = MF->getSubtarget<GCNSubtarget>();
+static unsigned findScratchNonCalleeSaveRegister(MachineFunction &MF,
+                                                 LivePhysRegs &LiveRegs,
+                                                 const TargetRegisterClass &RC) {
+  const GCNSubtarget &Subtarget = MF.getSubtarget<GCNSubtarget>();
   const SIRegisterInfo &TRI = *Subtarget.getRegisterInfo();
-  LivePhysRegs LiveRegs(TRI);
-  LiveRegs.addLiveIns(MBB);

   // Mark callee saved registers as used so we will not choose them.
-  const MCPhysReg *CSRegs = TRI.getCalleeSavedRegs(MF);
+  const MCPhysReg *CSRegs = TRI.getCalleeSavedRegs(&MF);
   for (unsigned i = 0; CSRegs[i]; ++i)
     LiveRegs.addReg(CSRegs[i]);

-  MachineRegisterInfo &MRI = MF->getRegInfo();
+  MachineRegisterInfo &MRI = MF.getRegInfo();

-  for (unsigned Reg : AMDGPU::SReg_32_XM0RegClass) {
+  for (unsigned Reg : RC) {
     if (LiveRegs.available(MRI, Reg))
       return Reg;
   }
@@ -561,6 +559,7 @@
   unsigned StackPtrReg = FuncInfo->getStackPtrOffsetReg();
   unsigned FramePtrReg = FuncInfo->getFrameOffsetReg();
+  LivePhysRegs LiveRegs;

   MachineBasicBlock::iterator MBBI = MBB.begin();
   DebugLoc DL;

@@ -578,7 +577,12 @@
     RoundedSize += Alignment;

-    unsigned ScratchSPReg = findScratchNonCalleeSaveRegister(MBB);
+    LiveRegs.init(TRI);
+    LiveRegs.addLiveIns(MBB);
+
+    unsigned ScratchSPReg
+      = findScratchNonCalleeSaveRegister(MF, LiveRegs,
+                                         AMDGPU::SReg_32_XM0RegClass);
     assert(ScratchSPReg != AMDGPU::NoRegister);

     // s_add_u32 tmp_reg, s32, NumBytes
     // s_and_b32 s32, tmp_reg, 0b111...0000
@@ -609,13 +613,33 @@
       .setMIFlag(MachineInstr::FrameSetup);
   }

-  for (const SIMachineFunctionInfo::SGPRSpillVGPRCSR &Reg
-         : FuncInfo->getSGPRSpillVGPRs()) {
-    if (!Reg.FI.hasValue())
-      continue;
-    TII->storeRegToStackSlot(MBB, MBBI, Reg.VGPR, true,
-                             Reg.FI.getValue(), &AMDGPU::VGPR_32RegClass,
-                             &TII->getRegisterInfo());
+  if (!FuncInfo->getSGPRSpillVGPRs().empty()) {
+    if (LiveRegs.empty()) {
+      LiveRegs.init(TRI);
+      LiveRegs.addLiveIns(MBB);
+    }
+
+    // To avoid clobbering VGPRs in lanes that weren't active on function entry,
+    // turn on all lanes before doing the spill to memory.
+    unsigned ScratchExecCopy
+      = findScratchNonCalleeSaveRegister(MF, LiveRegs,
+                                         AMDGPU::SReg_64_XEXECRegClass);
+
+    BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::S_OR_SAVEEXEC_B64), ScratchExecCopy)
+      .addImm(-1);
+
+    for (const SIMachineFunctionInfo::SGPRSpillVGPRCSR &Reg
+           : FuncInfo->getSGPRSpillVGPRs()) {
+      if (!Reg.FI.hasValue())
+        continue;
+      TII->storeRegToStackSlot(MBB, MBBI, Reg.VGPR, true,
+                               Reg.FI.getValue(), &AMDGPU::VGPR_32RegClass,
+                               &TII->getRegisterInfo());
+    }
+
+    // FIXME: Split block and make terminator.
+    BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::S_MOV_B64), AMDGPU::EXEC)
+      .addReg(ScratchExecCopy);
   }
 }

@@ -628,14 +652,32 @@
   const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
   const SIInstrInfo *TII = ST.getInstrInfo();
   MachineBasicBlock::iterator MBBI = MBB.getFirstTerminator();
+  DebugLoc DL;

-  for (const SIMachineFunctionInfo::SGPRSpillVGPRCSR &Reg
-         : FuncInfo->getSGPRSpillVGPRs()) {
-    if (!Reg.FI.hasValue())
-      continue;
-    TII->loadRegFromStackSlot(MBB, MBBI, Reg.VGPR,
-                              Reg.FI.getValue(), &AMDGPU::VGPR_32RegClass,
-                              &TII->getRegisterInfo());
+  if (!FuncInfo->getSGPRSpillVGPRs().empty()) {
+    // See emitPrologue
+    LivePhysRegs LiveRegs(*ST.getRegisterInfo());
+    LiveRegs.addLiveIns(MBB);
+
+    unsigned ScratchExecCopy
+      = findScratchNonCalleeSaveRegister(MF, LiveRegs,
+                                         AMDGPU::SReg_64_XEXECRegClass);
+
+    BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::S_OR_SAVEEXEC_B64), ScratchExecCopy)
+      .addImm(-1);
+
+    for (const SIMachineFunctionInfo::SGPRSpillVGPRCSR &Reg
+           : FuncInfo->getSGPRSpillVGPRs()) {
+      if (!Reg.FI.hasValue())
+        continue;
+      TII->loadRegFromStackSlot(MBB, MBBI, Reg.VGPR,
+                                Reg.FI.getValue(), &AMDGPU::VGPR_32RegClass,
+                                &TII->getRegisterInfo());
+    }
+
+    // FIXME: Split block and make terminator.
+    BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::S_MOV_B64), AMDGPU::EXEC)
+      .addReg(ScratchExecCopy);
   }

   unsigned StackPtrReg = FuncInfo->getStackPtrOffsetReg();
@@ -645,8 +687,6 @@
   const MachineFrameInfo &MFI = MF.getFrameInfo();
   uint32_t NumBytes = MFI.getStackSize();

-  DebugLoc DL;
-
   // FIXME: Clarify distinction between no set SP and SP. For callee functions,
   // it's really whether we need SP to be accurate or not.

Index: test/CodeGen/AMDGPU/byval-frame-setup.ll
===================================================================
--- test/CodeGen/AMDGPU/byval-frame-setup.ll
+++ test/CodeGen/AMDGPU/byval-frame-setup.ll
@@ -30,11 +30,11 @@

 ; GCN-LABEL: {{^}}void_func_byval_struct_non_leaf:
 ; GCN: s_mov_b32 s5, s32
+; GCN: s_add_u32 s32, s32, 0xc00{{$}}
 ; GCN-DAG: buffer_store_dword v32
 ; GCN-DAG: buffer_store_dword v33
 ; GCN-NOT: v_writelane_b32 v{{[0-9]+}}, s32
 ; GCN-DAG: v_writelane_b32
-; GCN-DAG: s_add_u32 s32, s32, 0xc00{{$}}
 ; GCN-DAG: buffer_load_dword [[LOAD0:v[0-9]+]], off, s[0:3], s5 offset:4{{$}}
 ; GCN-DAG: v_add_{{[iu]}}32_e32 [[ADD0:v[0-9]+]], vcc, 1, [[LOAD0]]
 ; GCN-DAG: buffer_store_dword [[ADD0]], off, s[0:3], s5 offset:4{{$}}
Index: test/CodeGen/AMDGPU/call-preserved-registers.ll
===================================================================
--- test/CodeGen/AMDGPU/call-preserved-registers.ll
+++ test/CodeGen/AMDGPU/call-preserved-registers.ll
@@ -38,8 +38,8 @@
 ; GCN-NEXT: ;;#ASMSTART
 ; GCN-NEXT: ;;#ASMEND
 ; GCN-NEXT: s_swappc_b64
-; GCN-NEXT: s_mov_b32 s5, s33
-; GCN: v_readlane_b32 s37, v32, 4
+; GCN-DAG: s_mov_b32 s5, s33
+; GCN-DAG: v_readlane_b32 s37, v32, 4
 ; GCN: v_readlane_b32 s36, v32, 3
 ; GCN: v_readlane_b32 s35, v32, 2
 ; GCN: v_readlane_b32 s34, v32, 1
@@ -59,7 +59,7 @@
 ; GCN-NEXT: s_mov_b32 s5, s33
 ; GCN-NEXT: s_mov_b32 s33, s5
 ; GCN-NEXT: s_swappc_b64
-; GCN-NEXT: s_mov_b32 s5, s33
+; GCN: s_mov_b32 s5, s33
 define void @test_func_call_external_void_funcx2() #0 {
   call void @external_void_func_void()
   call void @external_void_func_void()
@@ -175,7 +175,7 @@
 ; GCN-NEXT: ; clobber
 ; GCN-NEXT: #ASMEND
 ; GCN-NEXT: v_readlane_b32 s33, v0, 0
-; GCN-NEXT: s_setpc_b64
+; GCN: s_setpc_b64
 define hidden void @void_func_void_clobber_s33() #2 {
   call void asm sideeffect "; clobber", "~{s33}"() #0
   ret void
Index: test/CodeGen/AMDGPU/callee-frame-setup.ll
===================================================================
--- test/CodeGen/AMDGPU/callee-frame-setup.ll
+++ test/CodeGen/AMDGPU/callee-frame-setup.ll
@@ -37,19 +37,19 @@
 ; GCN: ; %bb.0:
 ; GCN-NEXT: s_waitcnt
 ; GCN: s_mov_b32 s5, s32
+; GCN-DAG: s_add_u32 s32, s32, 0x400{{$}}
 ; GCN: buffer_store_dword v32, off, s[0:3], s5 offset:8
 ; GCN-DAG: v_writelane_b32 v32, s33,
 ; GCN-DAG: v_writelane_b32 v32, s34,
 ; GCN-DAG: v_writelane_b32 v32, s35,
-; GCN-DAG: s_add_u32 s32, s32, 0x400{{$}}

 ; GCN-DAG: v_mov_b32_e32 v0, 0{{$}}
 ; GCN-DAG: buffer_store_dword v0, off, s[0:3], s5 offset:4{{$}}
 ; GCN-DAG: s_mov_b32 s33, s5

 ; GCN: s_swappc_b64

-; GCN: s_mov_b32 s5, s33
+; GCN-DAG: s_mov_b32 s5, s33
 ; GCN-DAG: v_readlane_b32 s35,
 ; GCN-DAG: v_readlane_b32 s34,
 ; GCN-DAG: v_readlane_b32 s33,
@@ -72,7 +72,9 @@
 ; GCN-LABEL: {{^}}callee_no_stack_with_call:
 ; GCN: s_waitcnt
 ; GCN: s_mov_b32 s5, s32
-; GCN: buffer_store_dword v32, off, s[0:3], s5 offset:4
+; GCN: s_or_saveexec_b64 [[COPY_EXEC0:s\[[0-9]+:[0-9]+\]]], -1{{$}}
+; GCN-NEXT: buffer_store_dword v32, off, s[0:3], s5 offset:4
+; GCN-NEXT: s_mov_b64 exec, [[COPY_EXEC0]]
 ; GCN-DAG: v_writelane_b32 v32, s33, 0
 ; GCN-DAG: v_writelane_b32 v32, s34, 1
 ; GCN: s_mov_b32 s33, s5
@@ -81,9 +83,12 @@

 ; GCN-DAG: v_readlane_b32 s34, v32, 1
 ; GCN-DAG: v_readlane_b32 s33, v32, 0

-; GCN: buffer_load_dword v32, off, s[0:3], s5 offset:4
-; GCN: s_sub_u32 s32, s32, 0x400
+; GCN: s_or_saveexec_b64 [[COPY_EXEC1:s\[[0-9]+:[0-9]+\]]], -1{{$}}
+; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s5 offset:4
+; GCN-NEXT: s_mov_b64 exec, [[COPY_EXEC1]]
+
+; GCN: s_sub_u32 s32, s32, 0x400
 ; GCN: s_setpc_b64
 define void @callee_no_stack_with_call() #0 {
   call void @external_void_func_void()
@@ -94,11 +99,18 @@

 ; Make sure if a CSR vgpr is used for SGPR spilling, it is saved and restored
 ; GCN-LABEL: {{^}}callee_func_sgpr_spill_no_calls:
-; GCN: buffer_store_dword v32, off, s[0:3], s5 offset:4 ; 4-byte Folded Spill
+; GCN: s_or_saveexec_b64 [[COPY_EXEC0:s\[[0-9]+:[0-9]+\]]], -1{{$}}
+; GCN-NEXT: buffer_store_dword v32, off, s[0:3], s5 offset:4 ; 4-byte Folded Spill
+; GCN-NEXT: s_mov_b64 exec, [[COPY_EXEC0]]
+
 ; GCN: v_writelane_b32 v32
 ; GCN: ;;#ASMSTART
 ; GCN: v_readlane_b32 s{{[0-9]+}}, v32
-; GCN: buffer_load_dword v32, off, s[0:3], s5 offset:4 ; 4-byte Folded Reload
+
+; GCN: s_or_saveexec_b64 [[COPY_EXEC1:s\[[0-9]+:[0-9]+\]]], -1{{$}}
+; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s5 offset:4 ; 4-byte Folded Reload
+; GCN-NEXT: s_mov_b64 exec, [[COPY_EXEC1]]
+
 ; GCN-NEXT: s_waitcnt
 ; GCN-NEXT: s_setpc_b64
 define void @callee_func_sgpr_spill_no_calls(i32 %in) #0 {
Index: test/CodeGen/AMDGPU/callee-special-input-vgprs.ll
===================================================================
--- test/CodeGen/AMDGPU/callee-special-input-vgprs.ll
+++ test/CodeGen/AMDGPU/callee-special-input-vgprs.ll
@@ -326,8 +326,8 @@

 ; Requires loading and storing to stack slot.
 ; GCN-LABEL: {{^}}too_many_args_call_too_many_args_use_workitem_id_x:
-; GCN: buffer_store_dword v32, off, s[0:3], s5 offset:8 ; 4-byte Folded Spill
 ; GCN: s_add_u32 s32, s32, 0x400{{$}}
+; GCN: buffer_store_dword v32, off, s[0:3], s5 offset:8 ; 4-byte Folded Spill

 ; GCN: buffer_load_dword v32, off, s[0:3], s5 offset:4
 ; GCN: buffer_store_dword v32, off, s[0:3], s32 offset:4{{$}}
Index: test/CodeGen/AMDGPU/cross-block-use-is-not-abi-copy.ll
===================================================================
--- test/CodeGen/AMDGPU/cross-block-use-is-not-abi-copy.ll
+++ test/CodeGen/AMDGPU/cross-block-use-is-not-abi-copy.ll
@@ -28,10 +28,12 @@
 ; GCN: ; %bb.0: ; %bb0
 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT: s_mov_b32 s5, s32
+; GCN-NEXT: s_add_u32 s32, s32, 0x400
+; GCN-NEXT: s_or_saveexec_b64 s[6:7], -1
 ; GCN-NEXT: buffer_store_dword v32, off, s[0:3], s5 offset:4 ; 4-byte Folded Spill
+; GCN-NEXT: s_mov_b64 exec, s[6:7]
 ; GCN-NEXT: v_writelane_b32 v32, s33, 0
 ; GCN-NEXT: v_writelane_b32 v32, s34, 1
-; GCN-NEXT: s_add_u32 s32, s32, 0x400
 ; GCN-NEXT: v_writelane_b32 v32, s35, 2
 ; GCN-NEXT: s_getpc_b64 s[6:7]
 ; GCN-NEXT: s_add_u32 s6, s6, func_v2f32@rel32@lo+4
@@ -39,12 +41,14 @@
 ; GCN-NEXT: s_mov_b64 s[34:35], s[30:31]
 ; GCN-NEXT: s_mov_b32 s33, s5
 ; GCN-NEXT: s_swappc_b64 s[30:31], s[6:7]
-; GCN-NEXT: s_mov_b32 s5, s33
 ; GCN-NEXT: s_mov_b64 s[30:31], s[34:35]
 ; GCN-NEXT: v_readlane_b32 s35, v32, 2
+; GCN-NEXT: s_mov_b32 s5, s33
 ; GCN-NEXT: v_readlane_b32 s34, v32, 1
 ; GCN-NEXT: v_readlane_b32 s33, v32, 0
+; GCN-NEXT: s_or_saveexec_b64 s[6:7], -1
 ; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s5 offset:4 ; 4-byte Folded Reload
+; GCN-NEXT: s_mov_b64 exec, s[6:7]
 ; GCN-NEXT: s_sub_u32 s32, s32, 0x400
 ; GCN-NEXT: s_waitcnt vmcnt(0)
 ; GCN-NEXT: s_setpc_b64 s[30:31]
@@ -62,10 +66,12 @@
 ; GCN: ; %bb.0: ; %bb0
 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT: s_mov_b32 s5, s32
+; GCN-NEXT: s_add_u32 s32, s32, 0x400
+; GCN-NEXT: s_or_saveexec_b64 s[6:7], -1
 ; GCN-NEXT: buffer_store_dword v32, off, s[0:3], s5 offset:4 ; 4-byte Folded Spill
+; GCN-NEXT: s_mov_b64 exec, s[6:7]
 ; GCN-NEXT: v_writelane_b32 v32, s33, 0
 ; GCN-NEXT: v_writelane_b32 v32, s34, 1
-; GCN-NEXT: s_add_u32 s32, s32, 0x400
 ; GCN-NEXT: v_writelane_b32 v32, s35, 2
 ; GCN-NEXT: s_getpc_b64 s[6:7]
 ; GCN-NEXT: s_add_u32 s6, s6, func_v3f32@rel32@lo+4
@@ -73,12 +79,14 @@
 ; GCN-NEXT: s_mov_b64 s[34:35], s[30:31]
 ; GCN-NEXT: s_mov_b32 s33, s5
 ; GCN-NEXT: s_swappc_b64 s[30:31], s[6:7]
-; GCN-NEXT: s_mov_b32 s5, s33
 ; GCN-NEXT: s_mov_b64 s[30:31], s[34:35]
 ; GCN-NEXT: v_readlane_b32 s35, v32, 2
+; GCN-NEXT: s_mov_b32 s5, s33
 ; GCN-NEXT: v_readlane_b32 s34, v32, 1
 ; GCN-NEXT: v_readlane_b32 s33, v32, 0
+; GCN-NEXT: s_or_saveexec_b64 s[6:7], -1
 ; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s5 offset:4 ; 4-byte Folded Reload
+; GCN-NEXT: s_mov_b64 exec, s[6:7]
 ; GCN-NEXT: s_sub_u32 s32, s32, 0x400
 ; GCN-NEXT: s_waitcnt vmcnt(0)
 ; GCN-NEXT: s_setpc_b64 s[30:31]
@@ -96,10 +104,12 @@
 ; GCN: ; %bb.0: ; %bb0
 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT: s_mov_b32 s5, s32
+; GCN-NEXT: s_add_u32 s32, s32, 0x400
+; GCN-NEXT: s_or_saveexec_b64 s[6:7], -1
 ; GCN-NEXT: buffer_store_dword v32, off, s[0:3], s5 offset:4 ; 4-byte Folded Spill
+; GCN-NEXT: s_mov_b64 exec, s[6:7]
 ; GCN-NEXT: v_writelane_b32 v32, s33, 0
 ; GCN-NEXT: v_writelane_b32 v32, s34, 1
-; GCN-NEXT: s_add_u32 s32, s32, 0x400
 ; GCN-NEXT: v_writelane_b32 v32, s35, 2
 ; GCN-NEXT: s_getpc_b64 s[6:7]
 ; GCN-NEXT: s_add_u32 s6, s6, func_v4f16@rel32@lo+4
@@ -107,12 +117,14 @@
 ; GCN-NEXT: s_mov_b64 s[34:35], s[30:31]
 ; GCN-NEXT: s_mov_b32 s33, s5
 ; GCN-NEXT: s_swappc_b64 s[30:31], s[6:7]
-; GCN-NEXT: s_mov_b32 s5, s33
 ; GCN-NEXT: s_mov_b64 s[30:31], s[34:35]
 ; GCN-NEXT: v_readlane_b32 s35, v32, 2
+; GCN-NEXT: s_mov_b32 s5, s33
 ; GCN-NEXT: v_readlane_b32 s34, v32, 1
 ; GCN-NEXT: v_readlane_b32 s33, v32, 0
+; GCN-NEXT: s_or_saveexec_b64 s[6:7], -1
 ; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s5 offset:4 ; 4-byte Folded Reload
+; GCN-NEXT: s_mov_b64 exec, s[6:7]
 ; GCN-NEXT: s_sub_u32 s32, s32, 0x400
 ; GCN-NEXT: s_waitcnt vmcnt(0)
 ; GCN-NEXT: s_setpc_b64 s[30:31]
@@ -130,10 +142,12 @@
 ; GCN: ; %bb.0: ; %bb0
 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT: s_mov_b32 s5, s32
+; GCN-NEXT: s_add_u32 s32, s32, 0x400
+; GCN-NEXT: s_or_saveexec_b64 s[6:7], -1
 ; GCN-NEXT: buffer_store_dword v32, off, s[0:3], s5 offset:4 ; 4-byte Folded Spill
+; GCN-NEXT: s_mov_b64 exec, s[6:7]
 ; GCN-NEXT: v_writelane_b32 v32, s33, 0
 ; GCN-NEXT: v_writelane_b32 v32, s34, 1
-; GCN-NEXT: s_add_u32 s32, s32, 0x400
 ; GCN-NEXT: v_writelane_b32 v32, s35, 2
 ; GCN-NEXT: s_getpc_b64 s[6:7]
 ; GCN-NEXT: s_add_u32 s6, s6, func_struct@rel32@lo+4
@@ -141,13 +155,15 @@
 ; GCN-NEXT: s_mov_b64 s[34:35], s[30:31]
 ; GCN-NEXT: s_mov_b32 s33, s5
 ; GCN-NEXT: s_swappc_b64 s[30:31], s[6:7]
-; GCN-NEXT: s_mov_b32 s5, s33
 ; GCN-NEXT: s_mov_b64 s[30:31], s[34:35]
 ; GCN-NEXT: v_readlane_b32 s35, v32, 2
+; GCN-NEXT: s_mov_b32 s5, s33
 ; GCN-NEXT: v_readlane_b32 s34, v32, 1
+; GCN-NEXT: v_mov_b32_e32 v1, v4
 ; GCN-NEXT: v_readlane_b32 s33, v32, 0
+; GCN-NEXT: s_or_saveexec_b64 s[6:7], -1
 ; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s5 offset:4 ; 4-byte Folded Reload
-; GCN-NEXT: v_mov_b32_e32 v1, v4
+; GCN-NEXT: s_mov_b64 exec, s[6:7]
 ; GCN-NEXT: s_sub_u32 s32, s32, 0x400
 ; GCN-NEXT: s_waitcnt vmcnt(0)
 ; GCN-NEXT: s_setpc_b64 s[30:31]
Index: test/CodeGen/AMDGPU/nested-calls.ll
===================================================================
--- test/CodeGen/AMDGPU/nested-calls.ll
+++ test/CodeGen/AMDGPU/nested-calls.ll
@@ -10,9 +10,12 @@
 ; GCN-LABEL: {{^}}test_func_call_external_void_func_i32_imm:
 ; GCN: s_waitcnt
 ; GCN: s_mov_b32 s5, s32
-; Spill CSR VGPR used for SGPR spilling
-; GCN: buffer_store_dword v32, off, s[0:3], s5 offset:4
 ; GCN-DAG: s_add_u32 s32, s32, 0x400
+; Spill CSR VGPR used for SGPR spilling
+; GCN: s_or_saveexec_b64 [[COPY_EXEC0:s\[[0-9]+:[0-9]+\]]], -1{{$}}
+; GCN-NEXT: buffer_store_dword v32, off, s[0:3], s5 offset:4
+; GCN-NEXT: s_mov_b64 exec, [[COPY_EXEC0]]
+
 ; GCN-DAG: v_writelane_b32 v32, s33, 0
 ; GCN-DAG: v_writelane_b32 v32, s34, 1
 ; GCN-DAG: v_writelane_b32 v32, s35, 2
@@ -22,7 +25,10 @@
 ; GCN: v_readlane_b32 s35, v32, 2
 ; GCN: v_readlane_b32 s34, v32, 1
 ; GCN: v_readlane_b32 s33, v32, 0
-; GCN: buffer_load_dword v32, off, s[0:3], s5 offset:4
+; GCN: s_or_saveexec_b64 [[COPY_EXEC1:s\[[0-9]+:[0-9]+\]]], -1{{$}}
+; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s5 offset:4
+; GCN-NEXT: s_mov_b64 exec, [[COPY_EXEC1]]
+
 ; GCN: s_sub_u32 s32, s32, 0x400
 ; GCN: s_setpc_b64
 define void @test_func_call_external_void_func_i32_imm() #0 {
Index: test/CodeGen/AMDGPU/sibling-call.ll
===================================================================
--- test/CodeGen/AMDGPU/sibling-call.ll
+++ test/CodeGen/AMDGPU/sibling-call.ll
@@ -207,13 +207,17 @@

 ; Have another non-tail in the function
 ; GCN-LABEL: {{^}}sibling_call_i32_fastcc_i32_i32_other_call:
 ; GCN: s_mov_b32 s5, s32
-; GCN: buffer_store_dword v34, off, s[0:3], s5 offset:12
+; GCN: s_add_u32 s32, s32, 0x400
+
+; GCN: s_or_saveexec_b64 s{{\[[0-9]+:[0-9]+\]}}, -1
+; GCN-NEXT: buffer_store_dword v34, off, s[0:3], s5 offset:12
+; GCN-NEXT: s_mov_b64 exec
+
 ; GCN: buffer_store_dword v32, off, s[0:3], s5 offset:8 ; 4-byte Folded Spill
 ; GCN: buffer_store_dword v33, off, s[0:3], s5 offset:4 ; 4-byte Folded Spill
 ; GCN-DAG: v_writelane_b32 v34, s33, 0
 ; GCN-DAG: v_writelane_b32 v34, s34, 1
 ; GCN-DAG: v_writelane_b32 v34, s35, 2
-; GCN-DAG: s_add_u32 s32, s32, 0x400
 ; GCN-DAG: s_getpc_b64
 ; GCN: s_swappc_b64
@@ -228,7 +232,10 @@

 ; GCN: buffer_load_dword v33, off, s[0:3], s5 offset:4
 ; GCN: buffer_load_dword v32, off, s[0:3], s5 offset:8
-; GCN: buffer_load_dword v34, off, s[0:3], s5 offset:12
+; GCN: s_or_saveexec_b64 s{{\[[0-9]+:[0-9]+\]}}, -1
+; GCN-NEXT: buffer_load_dword v34, off, s[0:3], s5 offset:12
+; GCN-NEXT: s_mov_b64 exec
+
 ; GCN: s_sub_u32 s32, s32, 0x400
 ; GCN: s_setpc_b64 s[6:7]
 define fastcc i32 @sibling_call_i32_fastcc_i32_i32_other_call(i32 %a, i32 %b, i32 %c) #1 {
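
Note on the new pairing the updated CHECK lines expect around each CSR VGPR spill and reload. This is a minimal sketch of the emitted sequence; the s[6:7] copy pair, v32, and the frame offsets are illustrative choices, not fixed by the patch:

  ; prologue: spill the CSR VGPR used for SGPR spilling with all lanes on
  s_or_saveexec_b64 s[6:7], -1                      ; copy EXEC into s[6:7], then set EXEC to all ones
  buffer_store_dword v32, off, s[0:3], s5 offset:4  ; store now also covers lanes inactive at entry
  s_mov_b64 exec, s[6:7]                            ; restore the entry EXEC mask

  ; epilogue: the mirror sequence around the reload
  s_or_saveexec_b64 s[6:7], -1
  buffer_load_dword v32, off, s[0:3], s5 offset:4
  s_mov_b64 exec, s[6:7]

buffer_store_dword and buffer_load_dword only operate on lanes enabled in EXEC, while v_writelane_b32 overwrites its target lane of the VGPR regardless of EXEC, so without this pairing the caller's values in lanes that were inactive on function entry would be clobbered by the SGPR spills.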