Index: lib/Target/ARM/ARMISelLowering.cpp
===================================================================
--- lib/Target/ARM/ARMISelLowering.cpp
+++ lib/Target/ARM/ARMISelLowering.cpp
@@ -1671,6 +1671,26 @@
     InFlag = SDValue();
   }
 
+  // For thumb1 targets, if R3 is used for argument passing, we need
+  // to place the call target address in IP (i.e. R12).
+  bool IsR3UsedForArgumentPassing = false;
+  if (RegsToPass.size() >= 4) {
+    IsR3UsedForArgumentPassing = true;
+  }
+
+  bool IsCallAddressMoveToRegisterRequired = false;
+  bool CallAdressShallBeForcedToHardRegR12 = false;
+
+  if (EnableARMLongCalls || (isTailCall && Subtarget->isThumb1Only() ))
+  {
+    IsCallAddressMoveToRegisterRequired = true;
+
+    if (isTailCall
+        && IsR3UsedForArgumentPassing
+        && Subtarget->isThumb1Only() )
+      CallAdressShallBeForcedToHardRegR12 = true;
+  }
+
   // If the callee is a GlobalAddress/ExternalSymbol node (quite common, every
   // direct call is) turn it into a TargetGlobalAddress/TargetExternalSymbol
   // node so that legalize doesn't hack it.
@@ -1679,10 +1699,12 @@
   bool isLocalARMFunc = false;
   ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
 
-  if (EnableARMLongCalls) {
+  if (IsCallAddressMoveToRegisterRequired) {
     assert((Subtarget->isTargetWindows() ||
+            (isTailCall && Subtarget->isThumb1Only()) ||
             getTargetMachine().getRelocationModel() == Reloc::Static) &&
            "long-calls with non-static relocation model!");
+
     // Handle a global address or an external symbol. If it's not one of
     // those, the target's already in a register, so we don't need to do
     // anything extra.
@@ -1695,11 +1717,14 @@
 
       // Get the address of the callee into a register
       SDValue CPAddr = DAG.getTargetConstantPool(CPV, getPointerTy(), 4);
+
       CPAddr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr);
+
       Callee = DAG.getLoad(getPointerTy(), dl,
                            DAG.getEntryNode(), CPAddr,
                            MachinePointerInfo::getConstantPool(),
                            false, false, false, 0);
+
     } else if (ExternalSymbolSDNode *S=dyn_cast<ExternalSymbolSDNode>(Callee)) {
       const char *Sym = S->getSymbol();
 
@@ -1785,6 +1810,12 @@
     }
   }
 
+  if (CallAdressShallBeForcedToHardRegR12) {
+    Chain = DAG.getCopyToReg(Chain, dl, ARM::R12,
+                             Callee,Chain.getValue(1));
+    Callee = DAG.getRegister (ARM::R12,getPointerTy());
+  }
+
   // FIXME: handle tail calls differently.
   unsigned CallOpc;
   bool HasMinSizeAttr = MF.getFunction()->getAttributes().hasAttribute(
@@ -2000,26 +2031,6 @@
   if (isCalleeStructRet || isCallerStructRet)
     return false;
 
-  // FIXME: Completely disable sibcall for Thumb1 since Thumb1RegisterInfo::
-  // emitEpilogue is not ready for them. Thumb tail calls also use t2B, as
-  // the Thumb1 16-bit unconditional branch doesn't have sufficient relocation
-  // support in the assembler and linker to be used. This would need to be
-  // fixed to fully support tail calls in Thumb1.
-  //
-  // Doing this is tricky, since the LDM/POP instruction on Thumb doesn't take
-  // LR. This means if we need to reload LR, it takes an extra instructions,
-  // which outweighs the value of the tail call; but here we don't know yet
-  // whether LR is going to be used. Probably the right approach is to
-  // generate the tail call here and turn it back into CALL/RET in
-  // emitEpilogue if LR is used.
-
-  // Thumb1 PIC calls to external symbols use BX, so they can be tail calls,
-  // but we need to make sure there are enough registers; the only valid
-  // registers are the 4 used for parameters. We don't currently do this
-  // case.
-  if (Subtarget->isThumb1Only())
-    return false;
-
   // Externally-defined functions with weak linkage should not be
   // tail-called on ARM when the OS does not support dynamic
   // pre-emption of symbols, as the AAELF spec requires normal calls
@@ -2365,7 +2376,7 @@
   if (!CI->isTailCall() || getTargetMachine().Options.DisableTailCalls)
     return false;
 
-  return !Subtarget->isThumb1Only();
+  return true;
 }
 
 // ConstantPool, JumpTable, GlobalAddress, and ExternalSymbol are lowered as
Index: lib/Target/ARM/ARMSubtarget.cpp
===================================================================
--- lib/Target/ARM/ARMSubtarget.cpp
+++ lib/Target/ARM/ARMSubtarget.cpp
@@ -262,7 +262,7 @@
     SupportsTailCall = !isTargetIOS() || !getTargetTriple().isOSVersionLT(5, 0);
   } else {
     IsR9Reserved = ReserveR9;
-    SupportsTailCall = !isThumb1Only();
+    SupportsTailCall = true;
   }
 
   if (Align == DefaultAlign) {
Index: lib/Target/ARM/Thumb1FrameLowering.cpp
===================================================================
--- lib/Target/ARM/Thumb1FrameLowering.cpp
+++ lib/Target/ARM/Thumb1FrameLowering.cpp
@@ -323,11 +323,18 @@
 }
 
 void Thumb1FrameLowering::emitEpilogue(MachineFunction &MF,
                                        MachineBasicBlock &MBB) const {
   MachineBasicBlock::iterator MBBI = MBB.getLastNonDebugInstr();
   assert((MBBI->getOpcode() == ARM::tBX_RET ||
-          MBBI->getOpcode() == ARM::tPOP_RET) &&
-         "Can only insert epilog into returning blocks");
+          MBBI->getOpcode() == ARM::tPOP_RET ||
+          MBBI->getOpcode() == ARM::TCRETURNri)
+         && "Can only insert epilog into returning blocks "
+            "and tail calls with address in regs.");
+
+  bool IsTailCallReturn = false;
+  if (MBBI->getOpcode() == ARM::TCRETURNri)
+    IsTailCallReturn = true;
+
   DebugLoc dl = MBBI->getDebugLoc();
   MachineFrameInfo *MFI = MF.getFrameInfo();
   ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
@@ -351,8 +358,8 @@
     if (NumBytes - ArgRegsSaveSize != 0)
       emitSPUpdate(MBB, MBBI, TII, dl, *RegInfo, NumBytes - ArgRegsSaveSize);
   } else {
-    // Unwind MBBI to point to first LDR / VLDRD.
-    if (MBBI != MBB.begin()) {
+    // Unwind MBBI to point to first LDR / VLDRD. Not for tail call returns!
+    if ((MBBI != MBB.begin()) && (!IsTailCallReturn)) {
       do
         --MBBI;
       while (MBBI != MBB.begin() && isCSRestore(MBBI, CSRegs));
@@ -395,12 +402,152 @@
     }
   }
 
-  bool IsV4PopReturn = false;
-  for (const CalleeSavedInfo &CSI : MFI->getCalleeSavedInfo())
+  bool IsR4InCSI = false;
+  bool IsR7InCSI = false;
+  bool IsLRInCSI = false;
+
+  for (const CalleeSavedInfo &CSI : MFI->getCalleeSavedInfo()) {
+    if (CSI.getReg() == ARM::R4)
+      IsR4InCSI = true;
+    if (CSI.getReg() == ARM::R7)
+      IsR7InCSI = true;
     if (CSI.getReg() == ARM::LR)
-      IsV4PopReturn = true;
+      IsLRInCSI = true;
+  }
+
+  bool IsV4PopReturn = IsLRInCSI;
   IsV4PopReturn &= STI.hasV4TOps() && !STI.hasV5TOps();
 
+  if (IsTailCallReturn) {
+    MBBI = MBB.getLastNonDebugInstr();
+
+    // First restore callee saved registers. Unlike for normal returns
+    // this is *not* done in restoreCalleeSavedRegisters.
+    const std::vector<CalleeSavedInfo> &CSI(MFI->getCalleeSavedInfo());
+
+    MachineFunction &MF = *MBB.getParent();
+    const TargetInstrInfo &TII = *MF.getSubtarget().getInstrInfo();
+
+    // We need to additionally push/pop R4 in case that LR reconstruction
+    // for tail calls requires R4 as scratch register.
+    bool IsR4ToBeAdditionallyAddedToPopIns = false;
+
+    bool IsR3AvailableAsSpill = true;
+
+    for (unsigned i = 0, e = MBBI->getNumOperands(); i != e; ++i) {
+      MachineOperand &Operand = MBBI->getOperand(i);
+      if (Operand.isReg()) {
+        if (Operand.getReg() == ARM::R3)
+          IsR3AvailableAsSpill = false;
+      }
+    }
+
+    bool IsLRRestoreAfterPop = IsR3AvailableAsSpill;
+
+    if (IsLRInCSI && ! IsLRRestoreAfterPop) {
+      // We need to restore LR and need a scratch register for this purpose
+      int StackSlotForSavedLR = CSI.size() - 1;
+      assert (StackSlotForSavedLR >= 0 && "Wrong Stack slot for LR.");
+
+      unsigned RegToUseForLRRestore;
+
+      // Make sure that R4/R7 or R3 may be used as scratch.
+      // Arrange for an additional tPUSH (R4) and pop {r4, ...} if necessary.
+      if (IsR4InCSI)
+        RegToUseForLRRestore = ARM::R4;
+      else if (IsR7InCSI)
+        RegToUseForLRRestore = ARM::R7;
+      else {
+        // Neither R4 nor R7 is callee-saved, so spill R4 ourselves and use
+        // it as the scratch register.
+        IsR4ToBeAdditionallyAddedToPopIns = true;
+        RegToUseForLRRestore = ARM::R4;
+
+        AddDefaultPred(BuildMI(MBB, MBBI, dl, TII.get(ARM::tPUSH)))
+          .addReg(ARM::R4,RegState::Kill);
+
+        StackSlotForSavedLR ++;
+      }
+
+      AddDefaultPred(BuildMI(MBB, MBBI, dl, TII.get(ARM::tLDRspi))
+                     .addReg(RegToUseForLRRestore, RegState::Define)
+                     .addReg(ARM::SP)
+                     .addImm(StackSlotForSavedLR));
+
+      AddDefaultPred(BuildMI(MBB, MBBI, dl, TII.get(ARM::tMOVr))
+                     .addReg(ARM::LR, RegState::Define)
+                     .addReg(RegToUseForLRRestore, RegState::Kill));
+    }
+
+    MachineInstrBuilder MIB = BuildMI(MF, dl, TII.get(ARM::tPOP));
+    AddDefaultPred(MIB);
+
+    bool NumRegs = false;
+
+    if (IsR4ToBeAdditionallyAddedToPopIns) {
+      MIB.addReg(ARM::R4, getDefRegState(true));
+      NumRegs = true;
+    }
+
+    for (unsigned i = CSI.size(); i != 0; --i) {
+      unsigned Reg = CSI[i-1].getReg();
+
+      if (Reg == ARM::R4 && IsR4ToBeAdditionallyAddedToPopIns)
+        continue;
+
+      if (Reg == ARM::LR)
+        continue;
+
+      MIB.addReg(Reg, getDefRegState(true));
+      NumRegs = true;
+    }
+
+    // It's illegal to emit pop instruction without operands.
+    if (NumRegs)
+      MBB.insert(MBBI, &*MIB);
+    else
+      MF.DeleteMachineInstr(MIB);
+
+    if (IsLRInCSI) {
+      const Thumb1RegisterInfo *RegInfo =
+        static_cast<const Thumb1RegisterInfo *>
+        (MF.getSubtarget().getRegisterInfo());
+
+      if (IsLRRestoreAfterPop) {
+        MachineInstrBuilder MIB = BuildMI(MF, dl, TII.get(ARM::tPOP));
+        AddDefaultPred(MIB);
+        MIB.addReg(ARM::R3, getDefRegState(true));
+        MBB.insert(MBBI, &*MIB);
+
+        AddDefaultPred(BuildMI(MBB, MBBI, dl, TII.get(ARM::tMOVr))
+                       .addReg(ARM::LR, RegState::Define)
+                       .addReg(ARM::R3, RegState::Kill));
+
+      } else {
+        // Re-adjust stack pointer for LR content still residing on the stack.
+        emitSPUpdate(MBB, MBBI, TII, dl, *RegInfo, 4);
+      }
+    }
+
+    MachineOperand &JumpTarget = MBBI->getOperand(0);
+
+    assert (MBBI->getOpcode() == ARM::TCRETURNri);
+    DebugLoc dl = MBBI->getDebugLoc();
+
+    BuildMI(MBB, MBBI, dl,
+            TII.get(ARM::tTAILJMPr))
+      .addReg(JumpTarget.getReg(), RegState::Kill);
+
+    MachineInstr *NewMI = std::prev(MBBI);
+    for (unsigned i = 1, e = MBBI->getNumOperands(); i != e; ++i)
+      NewMI->addOperand(MBBI->getOperand(i));
+
+    // Delete the pseudo instruction TCRETURN.
+    MBB.erase(MBBI);
+    MBBI = NewMI;
+    return;
+  }
+
   // Unlike T2 and ARM mode, the T1 pop instruction cannot restore
   // to LR, and we can't pop the value directly to the PC since
   // we need to update the SP after popping the value. So instead
@@ -501,15 +648,25 @@
                             MachineBasicBlock::iterator MI,
                             const std::vector<CalleeSavedInfo> &CSI,
                             const TargetRegisterInfo *TRI) const {
+
+  MachineBasicBlock::iterator MBBI = MBB.getLastNonDebugInstr();
+  bool IsTailCallReturn = false;
+  if(MBBI->getOpcode() == ARM::TCRETURNri)
+    IsTailCallReturn = true;
+
   if (CSI.empty())
     return false;
 
+  // We will handle callee saving in emitEpilogue and not here.
+  if (IsTailCallReturn)
+    return true;
+
   MachineFunction &MF = *MBB.getParent();
   ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
   const TargetInstrInfo &TII = *MF.getSubtarget().getInstrInfo();
-
   bool isVarArg = AFI->getArgRegsSaveSize() > 0;
   DebugLoc DL = MI->getDebugLoc();
+
   MachineInstrBuilder MIB = BuildMI(MF, DL, TII.get(ARM::tPOP));
   AddDefaultPred(MIB);
 
@@ -517,12 +674,15 @@
   for (unsigned i = CSI.size(); i != 0; --i) {
     unsigned Reg = CSI[i-1].getReg();
     if (Reg == ARM::LR) {
+
       // Special epilogue for vararg functions. See emitEpilogue
       if (isVarArg)
         continue;
-      // ARMv4T requires BX, see emitEpilogue
-      if (STI.hasV4TOps() && !STI.hasV5TOps())
+
+      // ARMv4T require BX, see emitEpilogue
+      if ((STI.hasV4TOps() && !STI.hasV5TOps()))
         continue;
+
       Reg = ARM::PC;
       (*MIB).setDesc(TII.get(ARM::tPOP_RET));
       MIB.copyImplicitOps(&*MI);
Index: test/CodeGen/ARM/fourParametersTailCall_v6m.ll
===================================================================
--- test/CodeGen/ARM/fourParametersTailCall_v6m.ll
+++ test/CodeGen/ARM/fourParametersTailCall_v6m.ll
@@ -0,0 +1,34 @@
+; RUN: llc -mtriple=thumbv6m-none--eabi -O3 %s -o - | FileCheck %s
+
+; ModuleID = 'fourParameters.c'
+target datalayout = "e-m:e-p:32:32-i64:64-v128:64:128-a:0:32-n32-S64"
+target triple = "thumbv6m-none--eabi"
+
+; Function Attrs: nounwind
+define void @hugo(i32 %a, i32 %b, i32 %c, i32 %d) #0 {
+  tail call void @peter(i32 %a, i32 %b, i32 %c, i32 %d) #2
+  ret void
+}
+
+declare void @peter(i32, i32, i32, i32) #1
+
+attributes #0 = { nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #1 = { "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #2 = { nounwind }
+
+!llvm.module.flags = !{!0, !1}
+!llvm.ident = !{!2}
+
+!0 = !{i32 1, !"wchar_size", i32 4}
+!1 = !{i32 1, !"min_enum_size", i32 4}
+!2 = !{!"clang version 3.6.0 (trunk 224418)"}
+
+; CHECK: ldr r4, .LCPI0_0
+; CHECK: mov r12, r4
+; CHECK: ldr r4, [sp, #4]
+; CHECK: mov lr, r4
+; CHECK: pop {r4}
+; CHECK: add sp, #4
+; CHECK: bx r12
+; CHECK: .long peter
+
Index: test/CodeGen/ARM/threeParametersTailCall_v6m.ll
===================================================================
--- test/CodeGen/ARM/threeParametersTailCall_v6m.ll
+++ test/CodeGen/ARM/threeParametersTailCall_v6m.ll
@@ -0,0 +1,29 @@
+; RUN: llc -mtriple=thumbv6m-none--eabi -O3 %s -o - | FileCheck %s
+
+; ModuleID = 'threeParameters.c'
+target datalayout = "e-m:e-p:32:32-i64:64-v128:64:128-a:0:32-n32-S64"
+target triple = "thumbv6m-none--eabi"
+
+; Function Attrs: nounwind
+define void @hugo(i32 %a, i32 %b, i32 %c) #0 {
+  tail call void @peter(i32 %a, i32 %b, i32 %c) #2
+  ret void
+}
+
+declare void @peter(i32, i32, i32) #1
+
+attributes #0 = { nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #1 = { "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #2 = { nounwind }
+
+!llvm.module.flags = !{!0, !1}
+!llvm.ident = !{!2}
+
+!0 = !{i32 1, !"wchar_size", i32 4}
+!1 = !{i32 1, !"min_enum_size", i32 4}
+!2 = !{!"clang version 3.6.0 (trunk 224418)"}
+
+; CHECK: ldr r3, .LCPI0_0
+; CHECK: bx r3
+; CHECK: .long peter
+
Index: test/CodeGen/ARM/twoParametersTailCall_v6m.ll
===================================================================
--- test/CodeGen/ARM/twoParametersTailCall_v6m.ll
+++ test/CodeGen/ARM/twoParametersTailCall_v6m.ll
@@ -0,0 +1,34 @@
+; RUN: llc -mtriple=thumbv6m-none--eabi -O3 %s -o - | FileCheck %s
+
+; ModuleID = 'twoParameters.c'
+target datalayout = "e-m:e-p:32:32-i64:64-v128:64:128-a:0:32-n32-S64"
+target triple = "thumbv6m-none--eabi"
+
+; Function Attrs: nounwind
+define void @hugo(i32 %a, i32 %b, i32 %c) #0 {
+entry:
+  tail call void @nonTailCall() #2
+  tail call void @peter(i32 %a, i32 %b) #2
+  ret void
+}
+
+declare void @nonTailCall() #1
+
+declare void @peter(i32, i32) #1
+
+attributes #0 = { nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #1 = { "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #2 = { nounwind }
+
+!llvm.module.flags = !{!0, !1}
+!llvm.ident = !{!2}
+
+!0 = !{i32 1, !"wchar_size", i32 4}
+!1 = !{i32 1, !"min_enum_size", i32 4}
+!2 = !{!"clang version 3.6.0 (trunk 225589)"}
+
+; CHECK: pop {r3}
+; CHECK: mov lr, r3
+; CHECK: bx r2
+; CHECK: .long peter
+