Index: lib/Target/AMDGPU/AMDGPURegisterInfo.cpp =================================================================== --- lib/Target/AMDGPU/AMDGPURegisterInfo.cpp +++ lib/Target/AMDGPU/AMDGPURegisterInfo.cpp @@ -59,16 +59,7 @@ const MCPhysReg * SIRegisterInfo::getCalleeSavedRegsViaCopy(const MachineFunction *MF) const { - // FIXME - static MCPhysReg Regs[2]; - - const SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>(); - assert(!MFI->isEntryFunction()); - - Regs[0] = MFI->getFrameOffsetReg(); - Regs[1] = AMDGPU::NoRegister; - - return Regs; + return nullptr; } const uint32_t *SIRegisterInfo::getCallPreservedMask(const MachineFunction &MF, Index: lib/Target/AMDGPU/SIISelLowering.cpp =================================================================== --- lib/Target/AMDGPU/SIISelLowering.cpp +++ lib/Target/AMDGPU/SIISelLowering.cpp @@ -2148,6 +2148,8 @@ MachineFrameInfo &MFI = MF.getFrameInfo(); SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPass; + SDValue CallerSavedFP; + // Adjust the stack pointer for the new arguments... // These operations are automatically eliminated by the prolog/epilog pass if (!IsSibCall) { @@ -2164,6 +2166,13 @@ SDValue ScratchWaveOffsetReg = DAG.getCopyFromReg(Chain, DL, OffsetReg, MVT::i32); RegsToPass.emplace_back(AMDGPU::SGPR4, ScratchWaveOffsetReg); + + if (!Info->isEntryFunction()) { + // Avoid clobbering this function's FP value. In the current convention + // callee will overwrite this, so do save/restore around the call site. + CallerSavedFP = DAG.getCopyFromReg(Chain, DL, + Info->getFrameOffsetReg(), MVT::i32); + } } // Stack pointer relative accesses are done by changing the offset SGPR. 
This @@ -2344,6 +2353,12 @@ Chain = Call.getValue(0); InFlag = Call.getValue(1); + if (CallerSavedFP) { + SDValue FPReg = DAG.getRegister(Info->getFrameOffsetReg(), MVT::i32); + Chain = DAG.getCopyToReg(Chain, DL, FPReg, CallerSavedFP, InFlag); + InFlag = Chain.getValue(1); + } + uint64_t CalleePopBytes = 0; Chain = DAG.getCALLSEQ_END(Chain, DAG.getTargetConstant(NumBytes, DL, MVT::i32), DAG.getTargetConstant(CalleePopBytes, DL, MVT::i32), Index: test/CodeGen/AMDGPU/call-preserved-registers.ll =================================================================== --- test/CodeGen/AMDGPU/call-preserved-registers.ll +++ test/CodeGen/AMDGPU/call-preserved-registers.ll @@ -32,11 +32,13 @@ ; GCN: v_writelane_b32 v32, s37, 4 ; GCN: s_mov_b32 s33, s5 -; GCN: s_swappc_b64 +; GCN-NEXT: s_swappc_b64 +; GCN-NEXT: s_mov_b32 s5, s33 ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: ;;#ASMEND +; GCN-NEXT: s_mov_b32 s33, s5 ; GCN-NEXT: s_swappc_b64 -; GCN: s_mov_b32 s5, s33 +; GCN-NEXT: s_mov_b32 s5, s33 ; GCN: v_readlane_b32 s37, v32, 4 ; GCN: v_readlane_b32 s36, v32, 3 ; GCN: v_readlane_b32 s35, v32, 2 @@ -50,6 +52,20 @@ ret void } +; FIXME: Avoid extra restore of FP in between calls. +; GCN-LABEL: {{^}}test_func_call_external_void_funcx2: +; GCN: s_mov_b32 s33, s5 +; GCN-NEXT: s_swappc_b64 +; GCN-NEXT: s_mov_b32 s5, s33 +; GCN-NEXT: s_mov_b32 s33, s5 +; GCN-NEXT: s_swappc_b64 +; GCN-NEXT: s_mov_b32 s5, s33 +define void @test_func_call_external_void_funcx2() #0 { + call void @external_void_func_void() + call void @external_void_func_void() + ret void +} + ; GCN-LABEL: {{^}}void_func_void_clobber_s30_s31: ; GCN: s_waitcnt ; GCN-NEXT: s_mov_b64 [[SAVEPC:s\[[0-9]+:[0-9]+\]]], s[30:31] Index: test/CodeGen/AMDGPU/ipra.ll =================================================================== --- test/CodeGen/AMDGPU/ipra.ll +++ test/CodeGen/AMDGPU/ipra.ll @@ -90,5 +90,19 @@ ret void } +define void @void_func_void() noinline { + ret void +} + +; Make sure we don't get save/restore of FP between calls. 
+; GCN-LABEL: {{^}}test_funcx2: +; GCN-NOT: s5 +; GCN-NOT: s32 +define void @test_funcx2() #0 { + call void @void_func_void() + call void @void_func_void() + ret void +} + attributes #0 = { nounwind } attributes #1 = { nounwind noinline } Index: test/CodeGen/AMDGPU/sibling-call.ll =================================================================== --- test/CodeGen/AMDGPU/sibling-call.ll +++ test/CodeGen/AMDGPU/sibling-call.ll @@ -13,8 +13,8 @@ ; GCN-LABEL: {{^}}i32_fastcc_i32_i32_stack_object: ; GCN: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN: s_mov_b32 s5, s32 ; GCN: v_add_i32_e32 v0, vcc, v1, v +; GCN: s_mov_b32 s5, s32 ; GCN: buffer_store_dword v{{[0-9]+}}, off, s[0:3], s5 offset:24 ; GCN: s_waitcnt vmcnt(0) ; GCN: s_setpc_b64