Index: lib/Target/ARM/ARMISelLowering.cpp =================================================================== --- lib/Target/ARM/ARMISelLowering.cpp +++ lib/Target/ARM/ARMISelLowering.cpp @@ -1671,6 +1671,18 @@ InFlag = SDValue(); } + // For thumb1 targets, if R3 is used for argument passing, we need + // to place the call target address in IP (i.e. R12). + bool IsR3UsedForArgumentPassing = false; + if (RegsToPass.size() >= 4) { + IsR3UsedForArgumentPassing = true; + } + + bool ForceCallAddrToRegR12 = false; + + if (isTailCall && IsR3UsedForArgumentPassing && Subtarget->isThumb1Only() ) + ForceCallAddrToRegR12 = true; + // If the callee is a GlobalAddress/ExternalSymbol node (quite common, every // direct call is) turn it into a TargetGlobalAddress/TargetExternalSymbol // node so that legalize doesn't hack it. @@ -1679,10 +1691,12 @@ bool isLocalARMFunc = false; ARMFunctionInfo *AFI = MF.getInfo(); - if (EnableARMLongCalls) { + if (EnableARMLongCalls || (isTailCall && Subtarget->isThumb1Only() )) { assert((Subtarget->isTargetWindows() || + (isTailCall && Subtarget->isThumb1Only()) || getTargetMachine().getRelocationModel() == Reloc::Static) && "long-calls with non-static relocation model!"); + // Handle a global address or an external symbol. If it's not one of // those, the target's already in a register, so we don't need to do // anything extra. @@ -1785,6 +1799,12 @@ } } + if (ForceCallAddrToRegR12) { + Chain = DAG.getCopyToReg(Chain, dl, ARM::R12, + Callee,Chain.getValue(1)); + Callee = DAG.getRegister (ARM::R12,getPointerTy()); + } + // FIXME: handle tail calls differently. unsigned CallOpc; bool HasMinSizeAttr = MF.getFunction()->getAttributes().hasAttribute( @@ -2000,26 +2020,6 @@ if (isCalleeStructRet || isCallerStructRet) return false; - // FIXME: Completely disable sibcall for Thumb1 since Thumb1RegisterInfo:: - // emitEpilogue is not ready for them. 
Thumb tail calls also use t2B, as - // the Thumb1 16-bit unconditional branch doesn't have sufficient relocation - // support in the assembler and linker to be used. This would need to be - // fixed to fully support tail calls in Thumb1. - // - // Doing this is tricky, since the LDM/POP instruction on Thumb doesn't take - // LR. This means if we need to reload LR, it takes an extra instructions, - // which outweighs the value of the tail call; but here we don't know yet - // whether LR is going to be used. Probably the right approach is to - // generate the tail call here and turn it back into CALL/RET in - // emitEpilogue if LR is used. - - // Thumb1 PIC calls to external symbols use BX, so they can be tail calls, - // but we need to make sure there are enough registers; the only valid - // registers are the 4 used for parameters. We don't currently do this - // case. - if (Subtarget->isThumb1Only()) - return false; - // Externally-defined functions with weak linkage should not be // tail-called on ARM when the OS does not support dynamic // pre-emption of symbols, as the AAELF spec requires normal calls @@ -2365,7 +2365,7 @@ if (!CI->isTailCall() || getTargetMachine().Options.DisableTailCalls) return false; - return !Subtarget->isThumb1Only(); + return true; } // ConstantPool, JumpTable, GlobalAddress, and ExternalSymbol are lowered as Index: lib/Target/ARM/ARMSubtarget.cpp =================================================================== --- lib/Target/ARM/ARMSubtarget.cpp +++ lib/Target/ARM/ARMSubtarget.cpp @@ -262,7 +262,7 @@ SupportsTailCall = !isTargetIOS() || !getTargetTriple().isOSVersionLT(5, 0); } else { IsR9Reserved = ReserveR9; - SupportsTailCall = !isThumb1Only(); + SupportsTailCall = true; } if (Align == DefaultAlign) { Index: lib/Target/ARM/Thumb1FrameLowering.cpp =================================================================== --- lib/Target/ARM/Thumb1FrameLowering.cpp +++ lib/Target/ARM/Thumb1FrameLowering.cpp @@ -235,7 +235,6 @@ } } - // 
Adjust FP so it point to the stack slot that contains the previous FP. if (HasFP) { FramePtrOffsetInBlock += MFI->getObjectOffset(FramePtrSpillFI) @@ -323,11 +322,18 @@ } void Thumb1FrameLowering::emitEpilogue(MachineFunction &MF, - MachineBasicBlock &MBB) const { + MachineBasicBlock &MBB) const { MachineBasicBlock::iterator MBBI = MBB.getLastNonDebugInstr(); assert((MBBI->getOpcode() == ARM::tBX_RET || - MBBI->getOpcode() == ARM::tPOP_RET) && - "Can only insert epilog into returning blocks"); + MBBI->getOpcode() == ARM::tPOP_RET || + MBBI->getOpcode() == ARM::TCRETURNri) + && "Can only insert epilog into returning blocks " + "and tail calls with address in regs."); + + bool IsTailCallReturn = false; + if (MBBI->getOpcode() == ARM::TCRETURNri) + IsTailCallReturn = true; + DebugLoc dl = MBBI->getDebugLoc(); MachineFrameInfo *MFI = MF.getFrameInfo(); ARMFunctionInfo *AFI = MF.getInfo(); @@ -351,8 +357,8 @@ if (NumBytes - ArgRegsSaveSize != 0) emitSPUpdate(MBB, MBBI, TII, dl, *RegInfo, NumBytes - ArgRegsSaveSize); } else { - // Unwind MBBI to point to first LDR / VLDRD. - if (MBBI != MBB.begin()) { + // Unwind MBBI to point to first LDR / VLDRD. Not for tail call returns! + if ((MBBI != MBB.begin()) && (!IsTailCallReturn)) { do --MBBI; while (MBBI != MBB.begin() && isCSRestore(MBBI, CSRegs)); @@ -390,16 +396,162 @@ MachineBasicBlock::iterator PMBBI = std::prev(MBBI); if (!tryFoldSPUpdateIntoPushPop(STI, MF, PMBBI, NumBytes)) emitSPUpdate(MBB, PMBBI, TII, dl, *RegInfo, NumBytes); - } else if (!tryFoldSPUpdateIntoPushPop(STI, MF, MBBI, NumBytes)) + } else if (IsTailCallReturn) { + // Don't try to fold SP update into push pop for tail call returns. 
+ emitSPUpdate(MBB, MBBI, TII, dl, *RegInfo, NumBytes); + } + else if (!tryFoldSPUpdateIntoPushPop(STI, MF, MBBI, NumBytes)) emitSPUpdate(MBB, MBBI, TII, dl, *RegInfo, NumBytes); } } - bool IsV4PopReturn = false; - for (const CalleeSavedInfo &CSI : MFI->getCalleeSavedInfo()) + bool IsR4InCSI = false; + bool IsR7InCSI = false; + bool IsLRInCSI = false; + + for (const CalleeSavedInfo &CSI : MFI->getCalleeSavedInfo()) { + if (CSI.getReg() == ARM::R4) + IsR4InCSI = true; + if (CSI.getReg() == ARM::R7) + IsR7InCSI = true; if (CSI.getReg() == ARM::LR) - IsV4PopReturn = true; - IsV4PopReturn &= STI.hasV4TOps() && !STI.hasV5TOps(); + IsLRInCSI = true; + } + + bool IsV4PopReturn = IsLRInCSI && STI.hasV4TOps() && !STI.hasV5TOps(); + + if (IsTailCallReturn) { + MBBI = MBB.getLastNonDebugInstr(); + + // First restore callee saved registers. Unlike for normal returns + // this is *not* done in restoreCalleeSavedRegisters. + const std::vector &CSI(MFI->getCalleeSavedInfo()); + + MachineFunction &MF = *MBB.getParent(); + const TargetInstrInfo &TII = *MF.getSubtarget().getInstrInfo(); + + // We need to additionally push/pop R4 in case that LR reconstruction + // for tail calls requires R4 as scratch register. + bool MustRestoreR4 = false; + + bool IsR3AvailableAsSpill = true; + + for (unsigned i = 0, e = MBBI->getNumOperands(); i != e; ++i) { + MachineOperand &Operand = MBBI->getOperand(i); + if (Operand.isReg()) { + if (Operand.getReg() == ARM::R3) + IsR3AvailableAsSpill = false; + } + } + + if (IsLRInCSI && ! IsR3AvailableAsSpill) { + // We need to restore LR before pop + // and need another scratch register for this purpose + int StackSlotForSavedLR = CSI.size() - 1; + assert (StackSlotForSavedLR >= 0 && "Wrong Stack slot for LR."); + + unsigned LRRestoreReg; + + // Make sure that R4/R7 or R3 may be used as scratch. + // Arrange for an additional tPUSH (R4) and pop {r4, ...} if necessary. 
+ if (IsR4InCSI) + LRRestoreReg = ARM::R4; + else if (IsR7InCSI) + LRRestoreReg = ARM::R7; + else { + MustRestoreR4 = true; + LRRestoreReg = ARM::R4; + + AddDefaultPred(BuildMI(MBB, MBBI, dl, TII.get(ARM::tPUSH))) + .addReg(ARM::R4,RegState::Kill); + + StackSlotForSavedLR ++; + } + + AddDefaultPred(BuildMI(MBB, MBBI, dl, TII.get(ARM::tLDRspi)) + .addReg(LRRestoreReg, RegState::Define) + .addReg(ARM::SP) + .addImm(StackSlotForSavedLR)); + + AddDefaultPred(BuildMI(MBB, MBBI, dl, TII.get(ARM::tMOVr)) + .addReg(ARM::LR, RegState::Define) + .addReg(LRRestoreReg, RegState::Kill)); + } + + MachineInstrBuilder MIB = BuildMI(MF, dl, TII.get(ARM::tPOP)); + AddDefaultPred(MIB); + + bool EmptyPop = true; + + if (MustRestoreR4) { + MIB.addReg(ARM::R4, getDefRegState(true)); + EmptyPop = false; + } + + for (unsigned i = CSI.size(); i != 0; --i) { + unsigned Reg = CSI[i-1].getReg(); + + if (Reg == ARM::R4 && MustRestoreR4) + continue; + + if (Reg == ARM::LR) + continue; + + MIB.addReg(Reg, getDefRegState(true)); + EmptyPop = false; + } + + // It's illegal to emit pop instruction without operands. + if (EmptyPop) + MF.DeleteMachineInstr(MIB); + else + MBB.insert(MBBI, &*MIB); + + if (IsLRInCSI) { + const Thumb1RegisterInfo *RegInfo = + static_cast + (MF.getSubtarget().getRegisterInfo()); + + if (IsR3AvailableAsSpill) { + // Restore LR after pop possible. + + MachineInstrBuilder MIB = BuildMI(MF, dl, TII.get(ARM::tPOP)); + AddDefaultPred(MIB); + MIB.addReg(ARM::R3, getDefRegState(true)); + MBB.insert(MBBI, &*MIB); + + AddDefaultPred(BuildMI(MBB, MBBI, dl, TII.get(ARM::tMOVr)) + .addReg(ARM::LR, RegState::Define) + .addReg(ARM::R3, RegState::Kill)); + + if (ArgRegsSaveSize) { + emitSPUpdate(MBB, MBBI, TII, dl, *RegInfo, ArgRegsSaveSize); + } + + } else { + // Re-adjust stack pointer for LR content still residing on the stack. 
+ emitSPUpdate(MBB, MBBI, TII, dl, *RegInfo, 4 + ArgRegsSaveSize); + } + } + + MachineOperand &JumpTarget = MBBI->getOperand(0); + + assert (MBBI->getOpcode() == ARM::TCRETURNri); + DebugLoc dl = MBBI->getDebugLoc(); + + BuildMI(MBB, MBBI, dl, + TII.get(ARM::tTAILJMPr)) + .addReg(JumpTarget.getReg(), RegState::Kill); + + MachineInstr *NewMI = std::prev(MBBI); + for (unsigned i = 1, e = MBBI->getNumOperands(); i != e; ++i) + NewMI->addOperand(MBBI->getOperand(i)); + + // Delete the pseudo instruction TCRETURN. + MBB.erase(MBBI); + MBBI = NewMI; + return; + } // Unlike T2 and ARM mode, the T1 pop instruction cannot restore // to LR, and we can't pop the value directly to the PC since @@ -501,19 +653,25 @@ MachineBasicBlock::iterator MI, const std::vector &CSI, const TargetRegisterInfo *TRI) const { + + MachineBasicBlock::iterator MBBI = MBB.getLastNonDebugInstr(); + + if(MBBI->getOpcode() == ARM::TCRETURNri) + return true; // Handle pop generation in emitEpilogue + if (CSI.empty()) return false; MachineFunction &MF = *MBB.getParent(); ARMFunctionInfo *AFI = MF.getInfo(); const TargetInstrInfo &TII = *MF.getSubtarget().getInstrInfo(); - bool isVarArg = AFI->getArgRegsSaveSize() > 0; DebugLoc DL = MI->getDebugLoc(); + MachineInstrBuilder MIB = BuildMI(MF, DL, TII.get(ARM::tPOP)); AddDefaultPred(MIB); - bool NumRegs = false; + bool IsEmptyPop = true; for (unsigned i = CSI.size(); i != 0; --i) { unsigned Reg = CSI[i-1].getReg(); if (Reg == ARM::LR) { @@ -529,14 +687,14 @@ MI = MBB.erase(MI); } MIB.addReg(Reg, getDefRegState(true)); - NumRegs = true; + IsEmptyPop = false; } // It's illegal to emit pop instruction without operands. 
- if (NumRegs) - MBB.insert(MI, &*MIB); - else + if (IsEmptyPop) MF.DeleteMachineInstr(MIB); + else + MBB.insert(MI, &*MIB); return true; } Index: lib/Target/ARM/Thumb1RegisterInfo.cpp =================================================================== --- lib/Target/ARM/Thumb1RegisterInfo.cpp +++ lib/Target/ARM/Thumb1RegisterInfo.cpp @@ -417,12 +417,33 @@ // Thumb1 can't use the emergency spill slot on the stack because // ldr/str immediate offsets must be positive, and if we're referencing // off the frame pointer (if, for example, there are alloca() calls in - // the function, the offset will be negative. Use R12 instead since that's - // a call clobbered register that we know won't be used in Thumb1 mode. + // the function), the offset will be negative. + // We need a register as an emergency spill slot. + // Candidates are R12 and LR. R12 might be used in tail calls + // and LR might be used if @llvm.returnaddress is taken. + // Both are call-clobbered registers that otherwise won't be used in + // Thumb1 mode. + + MachineFunction &MF = *MBB.getParent(); + MachineFrameInfo *MFI = MF.getFrameInfo(); + + bool IsLRInCSI = false; + for (const CalleeSavedInfo &CSI : MFI->getCalleeSavedInfo()) { + if (CSI.getReg() == ARM::LR) + IsLRInCSI = true; + } + + unsigned ScavengeReg = ARM::R12; + if (IsLRInCSI && MBB.isLiveIn(ScavengeReg) && !MFI->isReturnAddressTaken()) + ScavengeReg = ARM::LR; + + assert(!MBB.isLiveIn(ScavengeReg) && + "No Scavenge register available."); + const TargetInstrInfo &TII = *MBB.getParent()->getSubtarget().getInstrInfo(); DebugLoc DL; AddDefaultPred(BuildMI(MBB, I, DL, TII.get(ARM::tMOVr)) - .addReg(ARM::R12, RegState::Define) + .addReg(ScavengeReg, RegState::Define) .addReg(Reg, RegState::Kill)); // The UseMI is where we would like to restore the register. If there's @@ -435,7 +456,7 @@ // If this instruction affects R12, adjust our restore point. 
for (unsigned i = 0, e = II->getNumOperands(); i != e; ++i) { const MachineOperand &MO = II->getOperand(i); - if (MO.isRegMask() && MO.clobbersPhysReg(ARM::R12)) { + if (MO.isRegMask() && MO.clobbersPhysReg(ScavengeReg)) { UseMI = II; done = true; break; @@ -443,7 +464,7 @@ if (!MO.isReg() || MO.isUndef() || !MO.getReg() || TargetRegisterInfo::isVirtualRegister(MO.getReg())) continue; - if (MO.getReg() == ARM::R12) { + if (MO.getReg() == ScavengeReg) { UseMI = II; done = true; break; @@ -452,7 +473,7 @@ } // Restore the register from R12 AddDefaultPred(BuildMI(MBB, UseMI, DL, TII.get(ARM::tMOVr)). - addReg(Reg, RegState::Define).addReg(ARM::R12, RegState::Kill)); + addReg(Reg, RegState::Define).addReg(ScavengeReg, RegState::Kill)); return true; } Index: test/CodeGen/ARM/fourParametersTailCall_v6m.ll =================================================================== --- test/CodeGen/ARM/fourParametersTailCall_v6m.ll +++ test/CodeGen/ARM/fourParametersTailCall_v6m.ll @@ -0,0 +1,35 @@ +; RUN: llc -mtriple=thumbv6m-none--eabi -O3 %s -o - | FileCheck %s + +target datalayout = "e-m:e-p:32:32-i64:64-v128:64:128-a:0:32-n32-S64" +target triple = "thumbv6m-none--eabi" + +define void @hugo(i32 %a, i32 %b, i32 %c, i32 %d) { + tail call void @peter(i32 %a, i32 %b, i32 %c, i32 %d) + ret void +; CHECK: ldr r4, .LCPI0_0 +; CHECK: mov r12, r4 +; CHECK: ldr r4, [sp, #4] +; CHECK: mov lr, r4 +; CHECK: pop {r4} +; CHECK: add sp, #4 +; CHECK: bx r12 +; CHECK: .long peter +} + +declare void @peter(i32, i32, i32, i32) + +define i64 @hugo64(i32 %a, i32 %b, i32 %c, i32 %d) { +entry: + %call = tail call i64 @peter64(i32 %a, i32 %b, i32 %c, i32 %d) + ret i64 %call +; CHECK: ldr r4, .LCPI +; CHECK: mov r12, r4 +; CHECK: ldr r4, [sp, #4] +; CHECK: mov lr, r4 +; CHECK: pop {r4} +; CHECK: add sp, #4 +; CHECK: bx r12 +; CHECK: .long peter64 +} + +declare i64 @peter64(i32, i32, i32, i32) Index: test/CodeGen/ARM/threeParametersTailCall_v6m.ll 
=================================================================== --- test/CodeGen/ARM/threeParametersTailCall_v6m.ll +++ test/CodeGen/ARM/threeParametersTailCall_v6m.ll @@ -0,0 +1,47 @@ +; RUN: llc -mtriple=thumbv6m-none--eabi -O3 %s -o - | FileCheck %s + +; ModuleID = 'threeParameters.c' +target datalayout = "e-m:e-p:32:32-i64:64-v128:64:128-a:0:32-n32-S64" +target triple = "thumbv6m-none--eabi" + +define void @hugo(i32 %a, i32 %b, i32 %c) { + tail call void @peter(i32 %a, i32 %b, i32 %c) + ret void +; CHECK: ldr r3, .LCPI0_0 +; CHECK: bx r3 +; CHECK: .long peter +} + +declare void @peter(i32, i32, i32) + +define i64 @hugo64(i32 %a, i32 %b, i32 %c) { + %call = tail call i64 @peter64(i32 %a, i32 %b, i32 %c) + ret i64 %call +; CHECK: ldr r3, .LCPI +; CHECK: bx r3 +; CHECK: .long peter64 +} + +declare i64 @peter64(i32, i32, i32) + + +target datalayout = "e-m:e-p:32:32-i64:64-v128:64:128-a:0:32-n32-S64" +target triple = "thumbv6m-none--eabi" + +define void @paul(i32 %a, i32 %b, i32 %c) { + %1 = tail call i32 @otto() + tail call void @anna(i32 %1, i32 %1, i32 %1) + ret void +; CHECK: ldr r3, .LCPI +; CHECK: ldr r7, [sp, #4] +; CHECK: mov lr, r7 +; CHECK: pop {r7} +; CHECK: add sp, #4 +; CHECK: bx r3 +; CHECK: .long anna +} + +declare i32 @otto() + +declare void @anna(i32, i32, i32) + Index: test/CodeGen/ARM/twoParametersTailCall_v6m.ll =================================================================== --- test/CodeGen/ARM/twoParametersTailCall_v6m.ll +++ test/CodeGen/ARM/twoParametersTailCall_v6m.ll @@ -0,0 +1,20 @@ +; RUN: llc -mtriple=thumbv6m-none--eabi -O3 %s -o - | FileCheck %s + +target datalayout = "e-m:e-p:32:32-i64:64-v128:64:128-a:0:32-n32-S64" +target triple = "thumbv6m-none--eabi" + +define void @hugo(i32 %a, i32 %b, i32 %c) { +entry: + tail call void @nonTailCall() + tail call void @peter(i32 %a, i32 %b) + ret void +; CHECK: pop {r3} +; CHECK: mov lr, r3 +; CHECK: bx r2 +; CHECK: .long peter +} + +declare void @nonTailCall() + +declare void @peter(i32, 
i32) +