Index: lib/Target/AMDGPU/SIFrameLowering.cpp
===================================================================
--- lib/Target/AMDGPU/SIFrameLowering.cpp
+++ lib/Target/AMDGPU/SIFrameLowering.cpp
@@ -523,22 +523,20 @@
 // but we would then have to make sure that we were in fact saving at least one
 // callee-save register in the prologue, which is additional complexity that
 // doesn't seem worth the benefit.
-static unsigned findScratchNonCalleeSaveRegister(MachineBasicBlock &MBB) {
-  MachineFunction *MF = MBB.getParent();
-
-  const GCNSubtarget &Subtarget = MF->getSubtarget<GCNSubtarget>();
+static unsigned findScratchNonCalleeSaveRegister(MachineFunction &MF,
+                                                 LivePhysRegs &LiveRegs,
+                                                 const TargetRegisterClass &RC) {
+  const GCNSubtarget &Subtarget = MF.getSubtarget<GCNSubtarget>();
   const SIRegisterInfo &TRI = *Subtarget.getRegisterInfo();
-  LivePhysRegs LiveRegs(TRI);
-  LiveRegs.addLiveIns(MBB);

   // Mark callee saved registers as used so we will not choose them.
-  const MCPhysReg *CSRegs = TRI.getCalleeSavedRegs(MF);
+  const MCPhysReg *CSRegs = TRI.getCalleeSavedRegs(&MF);
   for (unsigned i = 0; CSRegs[i]; ++i)
     LiveRegs.addReg(CSRegs[i]);

-  MachineRegisterInfo &MRI = MF->getRegInfo();
+  MachineRegisterInfo &MRI = MF.getRegInfo();

-  for (unsigned Reg : AMDGPU::SReg_32_XM0RegClass) {
+  for (unsigned Reg : RC) {
     if (LiveRegs.available(MRI, Reg))
       return Reg;
   }
@@ -561,6 +559,7 @@
   unsigned StackPtrReg = FuncInfo->getStackPtrOffsetReg();
   unsigned FramePtrReg = FuncInfo->getFrameOffsetReg();
+  LivePhysRegs LiveRegs;

   MachineBasicBlock::iterator MBBI = MBB.begin();
   DebugLoc DL;

@@ -578,7 +577,12 @@
     RoundedSize += Alignment;

-    unsigned ScratchSPReg = findScratchNonCalleeSaveRegister(MBB);
+    LiveRegs.init(TRI);
+    LiveRegs.addLiveIns(MBB);
+
+    unsigned ScratchSPReg
+      = findScratchNonCalleeSaveRegister(MF, LiveRegs,
+                                         AMDGPU::SReg_32_XM0RegClass);
     assert(ScratchSPReg != AMDGPU::NoRegister);

     // s_add_u32 tmp_reg, s32, NumBytes
     // s_and_b32 s32, tmp_reg, 0b111...0000
@@ -609,13 +613,33 @@
       .setMIFlag(MachineInstr::FrameSetup);
   }

-  for (const SIMachineFunctionInfo::SGPRSpillVGPRCSR &Reg
-         : FuncInfo->getSGPRSpillVGPRs()) {
-    if (!Reg.FI.hasValue())
-      continue;
-    TII->storeRegToStackSlot(MBB, MBBI, Reg.VGPR, true,
-                             Reg.FI.getValue(), &AMDGPU::VGPR_32RegClass,
-                             &TII->getRegisterInfo());
+  if (!FuncInfo->getSGPRSpillVGPRs().empty()) {
+    if (LiveRegs.empty()) {
+      LiveRegs.init(TRI);
+      LiveRegs.addLiveIns(MBB);
+    }
+
+    // To avoid clobbering VGPRs in lanes that weren't active on function entry,
+    // turn on all lanes before doing the spill to memory.
+    unsigned ScratchExecCopy
+      = findScratchNonCalleeSaveRegister(MF, LiveRegs,
+                                         AMDGPU::SReg_64_XEXECRegClass);
+
+    BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::S_OR_SAVEEXEC_B64), ScratchExecCopy)
+      .addImm(-1);
+
+    for (const SIMachineFunctionInfo::SGPRSpillVGPRCSR &Reg
+           : FuncInfo->getSGPRSpillVGPRs()) {
+      if (!Reg.FI.hasValue())
+        continue;
+      TII->storeRegToStackSlot(MBB, MBBI, Reg.VGPR, true,
+                               Reg.FI.getValue(), &AMDGPU::VGPR_32RegClass,
+                               &TII->getRegisterInfo());
+    }
+
+    // FIXME: Split block and make terminator.
+    BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::S_MOV_B64), AMDGPU::EXEC)
+      .addReg(ScratchExecCopy);
   }
 }

@@ -628,14 +652,32 @@
   const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
   const SIInstrInfo *TII = ST.getInstrInfo();
   MachineBasicBlock::iterator MBBI = MBB.getFirstTerminator();
+  DebugLoc DL;

-  for (const SIMachineFunctionInfo::SGPRSpillVGPRCSR &Reg
-         : FuncInfo->getSGPRSpillVGPRs()) {
-    if (!Reg.FI.hasValue())
-      continue;
-    TII->loadRegFromStackSlot(MBB, MBBI, Reg.VGPR,
-                              Reg.FI.getValue(), &AMDGPU::VGPR_32RegClass,
-                              &TII->getRegisterInfo());
+  if (!FuncInfo->getSGPRSpillVGPRs().empty()) {
+    // See emitPrologue
+    LivePhysRegs LiveRegs(*ST.getRegisterInfo());
+    LiveRegs.addLiveIns(MBB);
+
+    unsigned ScratchExecCopy
+      = findScratchNonCalleeSaveRegister(MF, LiveRegs,
+                                         AMDGPU::SReg_64_XEXECRegClass);
+
+    BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::S_OR_SAVEEXEC_B64), ScratchExecCopy)
+      .addImm(-1);
+
+    for (const SIMachineFunctionInfo::SGPRSpillVGPRCSR &Reg
+           : FuncInfo->getSGPRSpillVGPRs()) {
+      if (!Reg.FI.hasValue())
+        continue;
+      TII->loadRegFromStackSlot(MBB, MBBI, Reg.VGPR,
+                                Reg.FI.getValue(), &AMDGPU::VGPR_32RegClass,
+                                &TII->getRegisterInfo());
+    }
+
+    // FIXME: Split block and make terminator.
+    BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::S_MOV_B64), AMDGPU::EXEC)
+      .addReg(ScratchExecCopy);
   }

   unsigned StackPtrReg = FuncInfo->getStackPtrOffsetReg();
@@ -645,8 +687,6 @@
   const MachineFrameInfo &MFI = MF.getFrameInfo();
   uint32_t NumBytes = MFI.getStackSize();

-  DebugLoc DL;
-
   // FIXME: Clarify distinction between no set SP and SP. For callee functions,
   // it's really whether we need SP to be accurate or not.

Index: test/CodeGen/AMDGPU/byval-frame-setup.ll
===================================================================
--- test/CodeGen/AMDGPU/byval-frame-setup.ll
+++ test/CodeGen/AMDGPU/byval-frame-setup.ll
@@ -30,11 +30,11 @@

 ; GCN-LABEL: {{^}}void_func_byval_struct_non_leaf:
 ; GCN: s_mov_b32 s5, s32
+; GCN: s_add_u32 s32, s32, 0xc00{{$}}
 ; GCN-DAG: buffer_store_dword v32
 ; GCN-DAG: buffer_store_dword v33
 ; GCN-NOT: v_writelane_b32 v{{[0-9]+}}, s32
 ; GCN-DAG: v_writelane_b32
-; GCN-DAG: s_add_u32 s32, s32, 0xc00{{$}}
 ; GCN-DAG: buffer_load_dword [[LOAD0:v[0-9]+]], off, s[0:3], s5 offset:4{{$}}
 ; GCN-DAG: v_add_{{[iu]}}32_e32 [[ADD0:v[0-9]+]], vcc, 1, [[LOAD0]]
 ; GCN-DAG: buffer_store_dword [[ADD0]], off, s[0:3], s5 offset:4{{$}}
Index: test/CodeGen/AMDGPU/call-preserved-registers.ll
===================================================================
--- test/CodeGen/AMDGPU/call-preserved-registers.ll
+++ test/CodeGen/AMDGPU/call-preserved-registers.ll
@@ -38,8 +38,8 @@
 ; GCN-NEXT: ;;#ASMSTART
 ; GCN-NEXT: ;;#ASMEND
 ; GCN-NEXT: s_swappc_b64
-; GCN-NEXT: s_mov_b32 s5, s33
-; GCN: v_readlane_b32 s37, v32, 4
+; GCN-DAG: s_mov_b32 s5, s33
+; GCN-DAG: v_readlane_b32 s37, v32, 4
 ; GCN: v_readlane_b32 s36, v32, 3
 ; GCN: v_readlane_b32 s35, v32, 2
 ; GCN: v_readlane_b32 s34, v32, 1
@@ -59,7 +59,7 @@
 ; GCN-NEXT: s_mov_b32 s5, s33
 ; GCN-NEXT: s_mov_b32 s33, s5
 ; GCN-NEXT: s_swappc_b64
-; GCN-NEXT: s_mov_b32 s5, s33
+; GCN: s_mov_b32 s5, s33
 define void @test_func_call_external_void_funcx2() #0 {
   call void @external_void_func_void()
   call void @external_void_func_void()
@@ -175,7 +175,7 @@
 ; GCN-NEXT: ; clobber
 ; GCN-NEXT: #ASMEND
 ; GCN-NEXT: v_readlane_b32 s33, v0, 0
-; GCN-NEXT: s_setpc_b64
+; GCN: s_setpc_b64
 define hidden void @void_func_void_clobber_s33() #2 {
   call void asm sideeffect "; clobber", "~{s33}"() #0
   ret void
Index: test/CodeGen/AMDGPU/callee-frame-setup.ll
===================================================================
--- test/CodeGen/AMDGPU/callee-frame-setup.ll
+++ test/CodeGen/AMDGPU/callee-frame-setup.ll
@@ -37,19 +37,19 @@
 ; GCN: ; %bb.0:
 ; GCN-NEXT: s_waitcnt
 ; GCN: s_mov_b32 s5, s32
+; GCN-DAG: s_add_u32 s32, s32, 0x400{{$}}
 ; GCN: buffer_store_dword v32, off, s[0:3], s5 offset:8
 ; GCN-DAG: v_writelane_b32 v32, s33,
 ; GCN-DAG: v_writelane_b32 v32, s34,
 ; GCN-DAG: v_writelane_b32 v32, s35,
-; GCN-DAG: s_add_u32 s32, s32, 0x400{{$}}

 ; GCN-DAG: v_mov_b32_e32 v0, 0{{$}}
 ; GCN-DAG: buffer_store_dword v0, off, s[0:3], s5 offset:4{{$}}
 ; GCN-DAG: s_mov_b32 s33, s5

 ; GCN: s_swappc_b64

-; GCN: s_mov_b32 s5, s33
+; GCN-DAG: s_mov_b32 s5, s33
 ; GCN-DAG: v_readlane_b32 s35,
 ; GCN-DAG: v_readlane_b32 s34,
 ; GCN-DAG: v_readlane_b32 s33,
@@ -72,7 +72,9 @@
 ; GCN-LABEL: {{^}}callee_no_stack_with_call:
 ; GCN: s_waitcnt
 ; GCN: s_mov_b32 s5, s32
-; GCN: buffer_store_dword v32, off, s[0:3], s5 offset:4
+; GCN: s_or_saveexec_b64 [[COPY_EXEC0:s\[[0-9]+:[0-9]+\]]], -1{{$}}
+; GCN-NEXT: buffer_store_dword v32, off, s[0:3], s5 offset:4
+; GCN-NEXT: s_mov_b64 exec, [[COPY_EXEC0]]
 ; GCN-DAG: v_writelane_b32 v32, s33, 0
 ; GCN-DAG: v_writelane_b32 v32, s34, 1
 ; GCN: s_mov_b32 s33, s5
@@ -81,9 +83,12 @@

 ; GCN-DAG: v_readlane_b32 s34, v32, 1
 ; GCN-DAG: v_readlane_b32 s33, v32, 0

-; GCN: buffer_load_dword v32, off, s[0:3], s5 offset:4
-; GCN: s_sub_u32 s32, s32, 0x400
+; GCN: s_or_saveexec_b64 [[COPY_EXEC1:s\[[0-9]+:[0-9]+\]]], -1{{$}}
+; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s5 offset:4
+; GCN-NEXT: s_mov_b64 exec, [[COPY_EXEC1]]
+
+; GCN: s_sub_u32 s32, s32, 0x400
 ; GCN: s_setpc_b64
 define void @callee_no_stack_with_call() #0 {
   call void @external_void_func_void()
@@ -94,11 +99,18 @@

 ; Make sure if a CSR vgpr is used for SGPR spilling, it is saved and restored
 ; GCN-LABEL: {{^}}callee_func_sgpr_spill_no_calls:
-; GCN: buffer_store_dword v32, off, s[0:3], s5 offset:4 ; 4-byte Folded Spill
+; GCN: s_or_saveexec_b64 [[COPY_EXEC0:s\[[0-9]+:[0-9]+\]]], -1{{$}}
+; GCN-NEXT: buffer_store_dword v32, off, s[0:3], s5 offset:4 ; 4-byte Folded Spill
+; GCN-NEXT: s_mov_b64 exec, [[COPY_EXEC0]]
+
 ; GCN: v_writelane_b32 v32
 ; GCN: ;;#ASMSTART
 ; GCN: v_readlane_b32 s{{[0-9]+}}, v32
-; GCN: buffer_load_dword v32, off, s[0:3], s5 offset:4 ; 4-byte Folded Reload
+
+; GCN: s_or_saveexec_b64 [[COPY_EXEC1:s\[[0-9]+:[0-9]+\]]], -1{{$}}
+; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s5 offset:4 ; 4-byte Folded Reload
+; GCN-NEXT: s_mov_b64 exec, [[COPY_EXEC1]]
+
 ; GCN-NEXT: s_waitcnt
 ; GCN-NEXT: s_setpc_b64
 define void @callee_func_sgpr_spill_no_calls(i32 %in) #0 {
Index: test/CodeGen/AMDGPU/callee-special-input-vgprs.ll
===================================================================
--- test/CodeGen/AMDGPU/callee-special-input-vgprs.ll
+++ test/CodeGen/AMDGPU/callee-special-input-vgprs.ll
@@ -326,8 +326,8 @@

 ; Requires loading and storing to stack slot.
 ; GCN-LABEL: {{^}}too_many_args_call_too_many_args_use_workitem_id_x:
-; GCN: buffer_store_dword v32, off, s[0:3], s5 offset:8 ; 4-byte Folded Spill
 ; GCN: s_add_u32 s32, s32, 0x400{{$}}
+; GCN: buffer_store_dword v32, off, s[0:3], s5 offset:8 ; 4-byte Folded Spill

 ; GCN: buffer_load_dword v32, off, s[0:3], s5 offset:4
 ; GCN: buffer_store_dword v32, off, s[0:3], s32 offset:4{{$}}
Index: test/CodeGen/AMDGPU/cross-block-use-is-not-abi-copy.ll
===================================================================
--- test/CodeGen/AMDGPU/cross-block-use-is-not-abi-copy.ll
+++ test/CodeGen/AMDGPU/cross-block-use-is-not-abi-copy.ll
@@ -28,10 +28,12 @@
 ; GCN: ; %bb.0: ; %bb0
 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT: s_mov_b32 s5, s32
+; GCN-NEXT: s_add_u32 s32, s32, 0x400
+; GCN-NEXT: s_or_saveexec_b64 s[6:7], -1
 ; GCN-NEXT: buffer_store_dword v32, off, s[0:3], s5 offset:4 ; 4-byte Folded Spill
+; GCN-NEXT: s_mov_b64 exec, s[6:7]
 ; GCN-NEXT: v_writelane_b32 v32, s33, 0
 ; GCN-NEXT: v_writelane_b32 v32, s34, 1
-; GCN-NEXT: s_add_u32 s32, s32, 0x400
 ; GCN-NEXT: v_writelane_b32 v32, s35, 2
 ; GCN-NEXT: s_getpc_b64 s[6:7]
 ; GCN-NEXT: s_add_u32 s6, s6, func_v2f32@rel32@lo+4
@@ -39,12 +41,14 @@
 ; GCN-NEXT: s_mov_b64 s[34:35], s[30:31]
 ; GCN-NEXT: s_mov_b32 s33, s5
 ; GCN-NEXT: s_swappc_b64 s[30:31], s[6:7]
-; GCN-NEXT: s_mov_b32 s5, s33
 ; GCN-NEXT: s_mov_b64 s[30:31], s[34:35]
 ; GCN-NEXT: v_readlane_b32 s35, v32, 2
+; GCN-NEXT: s_mov_b32 s5, s33
 ; GCN-NEXT: v_readlane_b32 s34, v32, 1
 ; GCN-NEXT: v_readlane_b32 s33, v32, 0
+; GCN-NEXT: s_or_saveexec_b64 s[6:7], -1
 ; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s5 offset:4 ; 4-byte Folded Reload
+; GCN-NEXT: s_mov_b64 exec, s[6:7]
 ; GCN-NEXT: s_sub_u32 s32, s32, 0x400
 ; GCN-NEXT: s_waitcnt vmcnt(0)
 ; GCN-NEXT: s_setpc_b64 s[30:31]
@@ -62,10 +66,12 @@
 ; GCN: ; %bb.0: ; %bb0
 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT: s_mov_b32 s5, s32
+; GCN-NEXT: s_add_u32 s32, s32, 0x400
+; GCN-NEXT: s_or_saveexec_b64 s[6:7], -1
 ; GCN-NEXT: buffer_store_dword v32, off, s[0:3], s5 offset:4 ; 4-byte Folded Spill
+; GCN-NEXT: s_mov_b64 exec, s[6:7]
 ; GCN-NEXT: v_writelane_b32 v32, s33, 0
 ; GCN-NEXT: v_writelane_b32 v32, s34, 1
-; GCN-NEXT: s_add_u32 s32, s32, 0x400
 ; GCN-NEXT: v_writelane_b32 v32, s35, 2
 ; GCN-NEXT: s_getpc_b64 s[6:7]
 ; GCN-NEXT: s_add_u32 s6, s6, func_v3f32@rel32@lo+4
@@ -73,12 +79,14 @@
 ; GCN-NEXT: s_mov_b64 s[34:35], s[30:31]
 ; GCN-NEXT: s_mov_b32 s33, s5
 ; GCN-NEXT: s_swappc_b64 s[30:31], s[6:7]
-; GCN-NEXT: s_mov_b32 s5, s33
 ; GCN-NEXT: s_mov_b64 s[30:31], s[34:35]
 ; GCN-NEXT: v_readlane_b32 s35, v32, 2
+; GCN-NEXT: s_mov_b32 s5, s33
 ; GCN-NEXT: v_readlane_b32 s34, v32, 1
 ; GCN-NEXT: v_readlane_b32 s33, v32, 0
+; GCN-NEXT: s_or_saveexec_b64 s[6:7], -1
 ; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s5 offset:4 ; 4-byte Folded Reload
+; GCN-NEXT: s_mov_b64 exec, s[6:7]
 ; GCN-NEXT: s_sub_u32 s32, s32, 0x400
 ; GCN-NEXT: s_waitcnt vmcnt(0)
 ; GCN-NEXT: s_setpc_b64 s[30:31]
@@ -96,10 +104,12 @@
 ; GCN: ; %bb.0: ; %bb0
 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT: s_mov_b32 s5, s32
+; GCN-NEXT: s_add_u32 s32, s32, 0x400
+; GCN-NEXT: s_or_saveexec_b64 s[6:7], -1
 ; GCN-NEXT: buffer_store_dword v32, off, s[0:3], s5 offset:4 ; 4-byte Folded Spill
+; GCN-NEXT: s_mov_b64 exec, s[6:7]
 ; GCN-NEXT: v_writelane_b32 v32, s33, 0
 ; GCN-NEXT: v_writelane_b32 v32, s34, 1
-; GCN-NEXT: s_add_u32 s32, s32, 0x400
 ; GCN-NEXT: v_writelane_b32 v32, s35, 2
 ; GCN-NEXT: s_getpc_b64 s[6:7]
 ; GCN-NEXT: s_add_u32 s6, s6, func_v4f16@rel32@lo+4
@@ -107,12 +117,14 @@
 ; GCN-NEXT: s_mov_b64 s[34:35], s[30:31]
 ; GCN-NEXT: s_mov_b32 s33, s5
 ; GCN-NEXT: s_swappc_b64 s[30:31], s[6:7]
-; GCN-NEXT: s_mov_b32 s5, s33
 ; GCN-NEXT: s_mov_b64 s[30:31], s[34:35]
 ; GCN-NEXT: v_readlane_b32 s35, v32, 2
+; GCN-NEXT: s_mov_b32 s5, s33
 ; GCN-NEXT: v_readlane_b32 s34, v32, 1
 ; GCN-NEXT: v_readlane_b32 s33, v32, 0
+; GCN-NEXT: s_or_saveexec_b64 s[6:7], -1
 ; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s5 offset:4 ; 4-byte Folded Reload
+; GCN-NEXT: s_mov_b64 exec, s[6:7]
 ; GCN-NEXT: s_sub_u32 s32, s32, 0x400
 ; GCN-NEXT: s_waitcnt vmcnt(0)
 ; GCN-NEXT: s_setpc_b64 s[30:31]
@@ -130,10 +142,12 @@
 ; GCN: ; %bb.0: ; %bb0
 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT: s_mov_b32 s5, s32
+; GCN-NEXT: s_add_u32 s32, s32, 0x400
+; GCN-NEXT: s_or_saveexec_b64 s[6:7], -1
 ; GCN-NEXT: buffer_store_dword v32, off, s[0:3], s5 offset:4 ; 4-byte Folded Spill
+; GCN-NEXT: s_mov_b64 exec, s[6:7]
 ; GCN-NEXT: v_writelane_b32 v32, s33, 0
 ; GCN-NEXT: v_writelane_b32 v32, s34, 1
-; GCN-NEXT: s_add_u32 s32, s32, 0x400
 ; GCN-NEXT: v_writelane_b32 v32, s35, 2
 ; GCN-NEXT: s_getpc_b64 s[6:7]
 ; GCN-NEXT: s_add_u32 s6, s6, func_struct@rel32@lo+4
@@ -141,13 +155,15 @@
 ; GCN-NEXT: s_mov_b64 s[34:35], s[30:31]
 ; GCN-NEXT: s_mov_b32 s33, s5
 ; GCN-NEXT: s_swappc_b64 s[30:31], s[6:7]
-; GCN-NEXT: s_mov_b32 s5, s33
 ; GCN-NEXT: s_mov_b64 s[30:31], s[34:35]
 ; GCN-NEXT: v_readlane_b32 s35, v32, 2
+; GCN-NEXT: s_mov_b32 s5, s33
 ; GCN-NEXT: v_readlane_b32 s34, v32, 1
+; GCN-NEXT: v_mov_b32_e32 v1, v4
 ; GCN-NEXT: v_readlane_b32 s33, v32, 0
+; GCN-NEXT: s_or_saveexec_b64 s[6:7], -1
 ; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s5 offset:4 ; 4-byte Folded Reload
-; GCN-NEXT: v_mov_b32_e32 v1, v4
+; GCN-NEXT: s_mov_b64 exec, s[6:7]
 ; GCN-NEXT: s_sub_u32 s32, s32, 0x400
 ; GCN-NEXT: s_waitcnt vmcnt(0)
 ; GCN-NEXT: s_setpc_b64 s[30:31]
Index: test/CodeGen/AMDGPU/nested-calls.ll
===================================================================
--- test/CodeGen/AMDGPU/nested-calls.ll
+++ test/CodeGen/AMDGPU/nested-calls.ll
@@ -10,9 +10,12 @@
 ; GCN-LABEL: {{^}}test_func_call_external_void_func_i32_imm:
 ; GCN: s_waitcnt
 ; GCN: s_mov_b32 s5, s32
-; Spill CSR VGPR used for SGPR spilling
-; GCN: buffer_store_dword v32, off, s[0:3], s5 offset:4
 ; GCN-DAG: s_add_u32 s32, s32, 0x400
+; Spill CSR VGPR used for SGPR spilling
+; GCN: s_or_saveexec_b64 [[COPY_EXEC0:s\[[0-9]+:[0-9]+\]]], -1{{$}}
+; GCN-NEXT: buffer_store_dword v32, off, s[0:3], s5 offset:4
+; GCN-NEXT: s_mov_b64 exec, [[COPY_EXEC0]]
+
 ; GCN-DAG: v_writelane_b32 v32, s33, 0
 ; GCN-DAG: v_writelane_b32 v32, s34, 1
 ; GCN-DAG: v_writelane_b32 v32, s35, 2
@@ -22,7 +25,10 @@
 ; GCN: v_readlane_b32 s35, v32, 2
 ; GCN: v_readlane_b32 s34, v32, 1
 ; GCN: v_readlane_b32 s33, v32, 0
-; GCN: buffer_load_dword v32, off, s[0:3], s5 offset:4
+; GCN: s_or_saveexec_b64 [[COPY_EXEC1:s\[[0-9]+:[0-9]+\]]], -1{{$}}
+; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s5 offset:4
+; GCN-NEXT: s_mov_b64 exec, [[COPY_EXEC1]]
+
 ; GCN: s_sub_u32 s32, s32, 0x400
 ; GCN: s_setpc_b64
 define void @test_func_call_external_void_func_i32_imm() #0 {
Index: test/CodeGen/AMDGPU/sibling-call.ll
===================================================================
--- test/CodeGen/AMDGPU/sibling-call.ll
+++ test/CodeGen/AMDGPU/sibling-call.ll
@@ -207,13 +207,17 @@

 ; Have another non-tail in the function
 ; GCN-LABEL: {{^}}sibling_call_i32_fastcc_i32_i32_other_call:
 ; GCN: s_mov_b32 s5, s32
-; GCN: buffer_store_dword v34, off, s[0:3], s5 offset:12
+; GCN: s_add_u32 s32, s32, 0x400
+
+; GCN: s_or_saveexec_b64 s{{\[[0-9]+:[0-9]+\]}}, -1
+; GCN-NEXT: buffer_store_dword v34, off, s[0:3], s5 offset:12
+; GCN-NEXT: s_mov_b64 exec
+
 ; GCN: buffer_store_dword v32, off, s[0:3], s5 offset:8 ; 4-byte Folded Spill
 ; GCN: buffer_store_dword v33, off, s[0:3], s5 offset:4 ; 4-byte Folded Spill
 ; GCN-DAG: v_writelane_b32 v34, s33, 0
 ; GCN-DAG: v_writelane_b32 v34, s34, 1
 ; GCN-DAG: v_writelane_b32 v34, s35, 2
-; GCN-DAG: s_add_u32 s32, s32, 0x400
 ; GCN-DAG: s_getpc_b64
 ; GCN: s_swappc_b64
@@ -228,7 +232,10 @@

 ; GCN: buffer_load_dword v33, off, s[0:3], s5 offset:4
 ; GCN: buffer_load_dword v32, off, s[0:3], s5 offset:8
-; GCN: buffer_load_dword v34, off, s[0:3], s5 offset:12
+; GCN: s_or_saveexec_b64 s{{\[[0-9]+:[0-9]+\]}}, -1
+; GCN-NEXT: buffer_load_dword v34, off, s[0:3], s5 offset:12
+; GCN-NEXT: s_mov_b64 exec
+
 ; GCN: s_sub_u32 s32, s32, 0x400
 ; GCN: s_setpc_b64 s[6:7]
 define fastcc i32 @sibling_call_i32_fastcc_i32_i32_other_call(i32 %a, i32 %b, i32 %c) #1 {
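
Note on the new pairing the updated CHECK lines expect around each CSR VGPR spill and reload. This is a minimal sketch of the emitted sequence; the s[6:7] copy pair, v32, and the frame offsets are illustrative choices, not fixed by the patch:

  ; prologue: spill the CSR VGPR used for SGPR spilling with all lanes on
  s_or_saveexec_b64 s[6:7], -1                      ; copy EXEC into s[6:7], then set EXEC to all ones
  buffer_store_dword v32, off, s[0:3], s5 offset:4  ; store now also covers lanes inactive at entry
  s_mov_b64 exec, s[6:7]                            ; restore the entry EXEC mask

  ; epilogue: the mirror sequence around the reload
  s_or_saveexec_b64 s[6:7], -1
  buffer_load_dword v32, off, s[0:3], s5 offset:4
  s_mov_b64 exec, s[6:7]

buffer_store_dword and buffer_load_dword only operate on lanes enabled in EXEC, while v_writelane_b32 overwrites its target lane of the VGPR regardless of EXEC, so without this pairing the caller's values in lanes that were inactive on function entry would be clobbered by the SGPR spills.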