Index: lib/Target/ARM/ARMISelLowering.cpp
===================================================================
--- lib/Target/ARM/ARMISelLowering.cpp
+++ lib/Target/ARM/ARMISelLowering.cpp
@@ -2306,13 +2306,30 @@
   assert(Subtarget->supportsTailCall());
 
-  // Tail calls to function pointers cannot be optimized for Thumb1 if the args
-  // to the call take up r0-r3. The reason is that there are no legal registers
-  // left to hold the pointer to the function to be called.
-  if (Subtarget->isThumb1Only() && Outs.size() >= 4 &&
-      !isa<GlobalAddressSDNode>(Callee.getNode()))
+  if (Subtarget->isThumb1Only()) {
+    // FIXME: This approximation isn't right for non-ELF targets.
+    if (!Subtarget->isTargetELF())
+      return false;
+    bool IsImmediateCall = isa<GlobalAddressSDNode>(Callee.getNode()) ||
+                           isa<ExternalSymbolSDNode>(Callee.getNode());
+
+    // Tail calls to function pointers cannot be optimized for Thumb1 if the
+    // args to the call take up r0-r3. The reason is that there are no legal
+    // registers left to hold the pointer to the function to be called. r12 is
+    // free, but it would be tricky to emit the right sequence because LLVM
+    // doesn't treat it as allocatable.
+    if (Outs.size() >= 4 && (!Subtarget->hasV8MBaselineOps() ||
+                             !IsImmediateCall))
       return false;
+
+    // Don't try to emit a tail call on Thumb1 if the callee is a known global;
+    // we would be forced to load the address to a GPR. v8m is the exception: it
+    // supports the required immediate branch. (This restriction shouldn't be
+    // necessary for correctness; it's just a codesize optimization.)
+    if (!Subtarget->hasV8MBaselineOps() && IsImmediateCall)
+      return false;
+  }
+
   // Look for obvious safe cases to perform tail call optimization that do not
   // require ABI changes. This is what gcc calls sibcall.
 
@@ -2373,6 +2390,12 @@
     CCState CCInfo(CalleeCC, isVarArg, MF, ArgLocs, C);
     CCInfo.AnalyzeCallOperands(Outs, CCAssignFnForCall(CalleeCC, isVarArg));
     if (CCInfo.getNextStackOffset()) {
+      // On Thumb1, don't tail call functions which pass data on the stack;
+      // we need to be able to reverse the transform after isel to avoid
+      // increasing codesize. (See
+      // Thumb1FrameLowering::restoreCalleeSavedRegisters.)
+      if (Subtarget->isThumb1Only())
+        return false;
       // Check if the arguments are already laid out in the right way as
       // the caller's fixed stack objects.
       MachineFrameInfo &MFI = MF.getFrameInfo();
Index: lib/Target/ARM/ARMSubtarget.cpp
===================================================================
--- lib/Target/ARM/ARMSubtarget.cpp
+++ lib/Target/ARM/ARMSubtarget.cpp
@@ -206,28 +206,10 @@
   if (isTargetNaCl() || isAAPCS16_ABI())
     stackAlignment = 16;
 
-  // FIXME: Completely disable sibcall for Thumb1 since ThumbRegisterInfo::
-  // emitEpilogue is not ready for them. Thumb tail calls also use t2B, as
-  // the Thumb1 16-bit unconditional branch doesn't have sufficient relocation
-  // support in the assembler and linker to be used. This would need to be
-  // fixed to fully support tail calls in Thumb1.
-  //
-  // For ARMv8-M, we /do/ implement tail calls. Doing this is tricky for v8-M
-  // baseline, since the LDM/POP instruction on Thumb doesn't take LR. This
-  // means if we need to reload LR, it takes extra instructions, which outweighs
-  // the value of the tail call; but here we don't know yet whether LR is going
-  // to be used. We take the optimistic approach of generating the tail call and
-  // perhaps taking a hit if we need to restore the LR.
-
-  // Thumb1 PIC calls to external symbols use BX, so they can be tail calls,
-  // but we need to make sure there are enough registers; the only valid
-  // registers are the 4 used for parameters. We don't currently do this
-  // case.
-
-  SupportsTailCall = !isThumb() || hasV8MBaselineOps();
-
   if (isTargetMachO() && isTargetIOS() && getTargetTriple().isOSVersionLT(5, 0))
     SupportsTailCall = false;
+  else
+    SupportsTailCall = true;
 
   switch (IT) {
   case DefaultIT:
Index: lib/Target/ARM/Thumb1FrameLowering.cpp
===================================================================
--- lib/Target/ARM/Thumb1FrameLowering.cpp
+++ lib/Target/ARM/Thumb1FrameLowering.cpp
@@ -962,15 +962,10 @@
     if (Reg == ARM::LR) {
       Info.setRestored(false);
-      if (!MBB.succ_empty() ||
-          MI->getOpcode() == ARM::TCRETURNdi ||
-          MI->getOpcode() == ARM::TCRETURNri)
+      if (!MBB.succ_empty())
         // LR may only be popped into PC, as part of return sequence.
         // If this isn't the return sequence, we'll need emitPopSpecialFixUp
         // to restore LR the hard way.
-        // FIXME: if we don't pass any stack arguments it would be actually
-        // advantageous *and* correct to do the conversion to an ordinary call
-        // instruction here.
         continue;
       // Special epilogue for vararg functions. See emitEpilogue
       if (isVarArg)
@@ -978,6 +973,20 @@
       // ARMv4T requires BX, see emitEpilogue
       if (!STI.hasV5TOps())
         continue;
+      // Tailcall optimization failed; change TCRETURN to a tBL. We could
+      // fall back to emitPopSpecialFixUp instead, but that's more code.
+      // Assumes tail-calls don't pass any arguments on the stack;
+      // this is enforced in
+      // ARMTargetLowering::IsEligibleForTailCallOptimization.
+      if (MI->getOpcode() == ARM::TCRETURNdi ||
+          MI->getOpcode() == ARM::TCRETURNri) {
+        unsigned Opcode = MI->getOpcode() == ARM::TCRETURNdi
+                              ? ARM::tBL : ARM::tBLXr;
+        MachineInstrBuilder BL = BuildMI(MF, DL, TII.get(Opcode));
+        BL.add(predOps(ARMCC::AL));
+        BL.add(MI->getOperand(0));
+        MBB.insert(MI, &*BL);
+      }
       // Pop LR into PC.
       Reg = ARM::PC;
Index: test/CodeGen/ARM/thumb_indirect_calls.ll
===================================================================
--- test/CodeGen/ARM/thumb_indirect_calls.ll
+++ test/CodeGen/ARM/thumb_indirect_calls.ll
@@ -4,7 +4,7 @@
 @f = common global void (i32)* null, align 4
 
 ; CHECK-LABEL: foo:
-define void @foo(i32 %x) {
+define void @foo(i32 %x) "disable-tail-calls"="true" {
 entry:
   %0 = load void (i32)*, void (i32)** @f, align 4
   tail call void %0(i32 %x)
@@ -22,7 +22,7 @@
 }
 
 ; CHECK-LABEL: bar:
-define void @bar(void (i32)* nocapture %g, i32 %x, void (i32)* nocapture %h) {
+define void @bar(void (i32)* nocapture %g, i32 %x, void (i32)* nocapture %h) "disable-tail-calls"="true" {
 entry:
   tail call void %g(i32 %x)
   tail call void %h(i32 %x)
Index: test/CodeGen/ARM/v8m-tail-call.ll
===================================================================
--- test/CodeGen/ARM/v8m-tail-call.ll
+++ test/CodeGen/ARM/v8m-tail-call.ll
@@ -1,5 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc %s -o - -mtriple=thumbv8m.base | FileCheck %s
+; RUN: llc %s -o - -mtriple=thumbv6m-eabi | FileCheck %s -check-prefix=V6M
 
 declare i32 @g(...)
@@ -12,11 +13,19 @@
 ; CHECK-NEXT:    movs r1, #1
 ; CHECK-NEXT:    movs r2, #2
 ; CHECK-NEXT:    movs r3, #3
-; CHECK-NEXT:    ldr r7, [sp, #4]
-; CHECK-NEXT:    mov lr, r7
-; CHECK-NEXT:    pop {r7}
-; CHECK-NEXT:    add sp, #4
-; CHECK-NEXT:    b h0
+; CHECK-NEXT:    bl h0
+; CHECK-NEXT:    pop {r7, pc}
+;
+; V6M-LABEL: f0:
+; V6M:       @ %bb.0:
+; V6M-NEXT:    .save {r7, lr}
+; V6M-NEXT:    push {r7, lr}
+; V6M-NEXT:    bl g
+; V6M-NEXT:    movs r1, #1
+; V6M-NEXT:    movs r2, #2
+; V6M-NEXT:    movs r3, #3
+; V6M-NEXT:    bl h0
+; V6M-NEXT:    pop {r7, pc}
 %1 = tail call i32 bitcast (i32 (...)* @g to i32 ()*)()
 %2 = tail call i32 @h0(i32 %1, i32 1, i32 2, i32 3)
 ret i32 %2
@@ -28,10 +37,16 @@
 ; CHECK:       @ %bb.0:
 ; CHECK-NEXT:    push {r7, lr}
 ; CHECK-NEXT:    bl g
-; CHECK-NEXT:    pop {r7}
-; CHECK-NEXT:    pop {r1}
-; CHECK-NEXT:    mov lr, r1
-; CHECK-NEXT:    b h1
+; CHECK-NEXT:    bl h1
+; CHECK-NEXT:    pop {r7, pc}
+;
+; V6M-LABEL: f1:
+; V6M:       @ %bb.0:
+; V6M-NEXT:    .save {r7, lr}
+; V6M-NEXT:    push {r7, lr}
+; V6M-NEXT:    bl g
+; V6M-NEXT:    bl h1
+; V6M-NEXT:    pop {r7, pc}
 %1 = tail call i32 bitcast (i32 (...)* @g to i32 ()*)()
 %2 = tail call i32 @h1(i32 %1)
 ret i32 %2
@@ -42,24 +57,55 @@
 ; CHECK-LABEL: f2:
 ; CHECK:       @ %bb.0:
 ; CHECK-NEXT:    push {r4, r5, r6, lr}
+; CHECK-NEXT:    sub sp, #8
 ; CHECK-NEXT:    mov r4, r3
 ; CHECK-NEXT:    mov r5, r2
 ; CHECK-NEXT:    mov r6, r1
 ; CHECK-NEXT:    bl g
 ; CHECK-NEXT:    cbz r0, .LBB2_2
 ; CHECK-NEXT:  @ %bb.1:
+; CHECK-NEXT:    ldr r1, [sp, #24]
+; CHECK-NEXT:    mov r2, sp
+; CHECK-NEXT:    str r1, [r2]
 ; CHECK-NEXT:    mov r1, r6
 ; CHECK-NEXT:    mov r2, r5
 ; CHECK-NEXT:    mov r3, r4
-; CHECK-NEXT:    ldr r4, [sp, #12]
-; CHECK-NEXT:    mov lr, r4
-; CHECK-NEXT:    pop {r4, r5, r6}
-; CHECK-NEXT:    add sp, #4
-; CHECK-NEXT:    b h2
+; CHECK-NEXT:    bl h2
+; CHECK-NEXT:    add sp, #8
+; CHECK-NEXT:    pop {r4, r5, r6, pc}
 ; CHECK-NEXT:  .LBB2_2:
 ; CHECK-NEXT:    movs r0, #0
 ; CHECK-NEXT:    mvns r0, r0
+; CHECK-NEXT:    add sp, #8
 ; CHECK-NEXT:    pop {r4, r5, r6, pc}
+;
+; V6M-LABEL: f2:
+; V6M:       @ %bb.0:
+; V6M-NEXT:    .save {r4, r5, r6, lr}
+; V6M-NEXT:    push {r4, r5, r6, lr}
+; V6M-NEXT:    .pad #8
+; V6M-NEXT:    sub sp, #8
+; V6M-NEXT:    mov r4, r3
+; V6M-NEXT:    mov r5, r2
+; V6M-NEXT:    mov r6, r1
+; V6M-NEXT:    bl g
+; V6M-NEXT:    cmp r0, #0
+; V6M-NEXT:    beq .LBB2_2
+; V6M-NEXT:  @ %bb.1:
+; V6M-NEXT:    ldr r1, [sp, #24]
+; V6M-NEXT:    mov r2, sp
+; V6M-NEXT:    str r1, [r2]
+; V6M-NEXT:    mov r1, r6
+; V6M-NEXT:    mov r2, r5
+; V6M-NEXT:    mov r3, r4
+; V6M-NEXT:    bl h2
+; V6M-NEXT:    add sp, #8
+; V6M-NEXT:    pop {r4, r5, r6, pc}
+; V6M-NEXT:  .LBB2_2:
+; V6M-NEXT:    movs r0, #0
+; V6M-NEXT:    mvns r0, r0
+; V6M-NEXT:    add sp, #8
+; V6M-NEXT:    pop {r4, r5, r6, pc}
 %6 = tail call i32 bitcast (i32 (...)* @g to i32 ()*)()
 %7 = icmp eq i32 %6, 0
 br i1 %7, label %10, label %8
@@ -87,6 +133,23 @@
 ; CHECK-NEXT:    movs r3, #4
 ; CHECK-NEXT:    blx r4
 ; CHECK-NEXT:    pop {r4, pc}
+;
+; V6M-LABEL: test3:
+; V6M:       @ %bb.0:
+; V6M-NEXT:    .save {r4, lr}
+; V6M-NEXT:    push {r4, lr}
+; V6M-NEXT:    ldr r0, .LCPI3_0
+; V6M-NEXT:    ldr r4, [r0]
+; V6M-NEXT:    movs r0, #1
+; V6M-NEXT:    movs r1, #2
+; V6M-NEXT:    movs r2, #3
+; V6M-NEXT:    movs r3, #4
+; V6M-NEXT:    blx r4
+; V6M-NEXT:    pop {r4, pc}
+; V6M-NEXT:    .p2align 2
+; V6M-NEXT:  @ %bb.1:
+; V6M-NEXT:  .LCPI3_0:
+; V6M-NEXT:    .long fnptr
 %1 = load i32 (i32, i32, i32, i32)*, i32 (i32, i32, i32, i32)** @fnptr
 %2 = tail call i32 %1(i32 1, i32 2, i32 3, i32 4)
 ret i32 %2
@@ -106,6 +169,23 @@
 ; CHECK-NEXT:    movs r3, #0
 ; CHECK-NEXT:    blx r4
 ; CHECK-NEXT:    pop {r4, pc}
+;
+; V6M-LABEL: test4:
+; V6M:       @ %bb.0:
+; V6M-NEXT:    .save {r4, lr}
+; V6M-NEXT:    push {r4, lr}
+; V6M-NEXT:    ldr r0, .LCPI4_0
+; V6M-NEXT:    ldr r4, [r0]
+; V6M-NEXT:    movs r0, #1
+; V6M-NEXT:    movs r1, #2
+; V6M-NEXT:    movs r2, #3
+; V6M-NEXT:    movs r3, #0
+; V6M-NEXT:    blx r4
+; V6M-NEXT:    pop {r4, pc}
+; V6M-NEXT:    .p2align 2
+; V6M-NEXT:  @ %bb.1:
+; V6M-NEXT:  .LCPI4_0:
+; V6M-NEXT:    .long fnptr2
 %1 = load i32 (i32, i32, i64)*, i32 (i32, i32, i64)** @fnptr2
 %2 = tail call i32 %1(i32 1, i32 2, i64 3)
 ret i32 %2
@@ -124,6 +204,18 @@
 ; CHECK-NEXT:    movs r0, #1
 ; CHECK-NEXT:    movs r1, #2
 ; CHECK-NEXT:    bx r2
+;
+; V6M-LABEL: test5:
+; V6M:       @ %bb.0:
+; V6M-NEXT:    ldr r0, .LCPI5_0
+; V6M-NEXT:    ldr r2, [r0]
+; V6M-NEXT:    movs r0, #1
+; V6M-NEXT:    movs r1, #2
+; V6M-NEXT:    bx r2
+; V6M-NEXT:    .p2align 2
+; V6M-NEXT:  @ %bb.1:
+; V6M-NEXT:  .LCPI5_0:
+; V6M-NEXT:    .long fnptr3
 %1 = load i32 (i32, i32)*, i32 (i32, i32)** @fnptr3
 %2 = tail call i32 %1(i32 1, i32 2)
 ret i32 %2
@@ -141,6 +233,19 @@
 ; CHECK-NEXT:    movs r2, #2
 ; CHECK-NEXT:    movs r3, #0
 ; CHECK-NEXT:    bx r1
+;
+; V6M-LABEL: test6:
+; V6M:       @ %bb.0:
+; V6M-NEXT:    ldr r0, .LCPI6_0
+; V6M-NEXT:    ldr r1, [r0]
+; V6M-NEXT:    movs r0, #1
+; V6M-NEXT:    movs r2, #2
+; V6M-NEXT:    movs r3, #0
+; V6M-NEXT:    bx r1
+; V6M-NEXT:    .p2align 2
+; V6M-NEXT:  @ %bb.1:
+; V6M-NEXT:  .LCPI6_0:
+; V6M-NEXT:    .long fnptr4
 %1 = load i32 (i32, i64)*, i32 (i32, i64)** @fnptr4
 %2 = tail call i32 %1(i32 1, i64 2)
 ret i32 %2
@@ -156,6 +261,17 @@
 ; CHECK-NEXT:    movs r2, #3
 ; CHECK-NEXT:    movs r3, #4
 ; CHECK-NEXT:    b bar
+;
+; V6M-LABEL: test7:
+; V6M:       @ %bb.0:
+; V6M-NEXT:    .save {r7, lr}
+; V6M-NEXT:    push {r7, lr}
+; V6M-NEXT:    movs r0, #1
+; V6M-NEXT:    movs r1, #2
+; V6M-NEXT:    movs r2, #3
+; V6M-NEXT:    movs r3, #4
+; V6M-NEXT:    bl bar
+; V6M-NEXT:    pop {r7, pc}
 %tail = tail call i32 @bar(i32 1, i32 2, i32 3, i32 4)
 ret i32 %tail
 }
@@ -188,12 +304,36 @@
 ; CHECK-NEXT:    movs r1, #2
 ; CHECK-NEXT:    movs r2, #3
 ; CHECK-NEXT:    ldr r3, [sp] @ 4-byte Reload
+; CHECK-NEXT:    blx r3
 ; CHECK-NEXT:    add sp, #4
-; CHECK-NEXT:    ldr r4, [sp, #16]
-; CHECK-NEXT:    mov lr, r4
-; CHECK-NEXT:    pop {r4, r5, r6, r7}
-; CHECK-NEXT:    add sp, #4
-; CHECK-NEXT:    bx r3
+; CHECK-NEXT:    pop {r4, r5, r6, r7, pc}
+;
+; V6M-LABEL: test8:
+; V6M:       @ %bb.0: @ %entry
+; V6M-NEXT:    .save {r4, r5, r6, r7, lr}
+; V6M-NEXT:    push {r4, r5, r6, r7, lr}
+; V6M-NEXT:    .pad #4
+; V6M-NEXT:    sub sp, #4
+; V6M-NEXT:    mov r4, r1
+; V6M-NEXT:    str r0, [sp] @ 4-byte Spill
+; V6M-NEXT:    bl test8_u
+; V6M-NEXT:    mov r5, r0
+; V6M-NEXT:    ldr r6, [r0]
+; V6M-NEXT:    movs r7, #0
+; V6M-NEXT:    mov r0, r7
+; V6M-NEXT:    bl test8_h
+; V6M-NEXT:    mov r1, r0
+; V6M-NEXT:    mov r0, r6
+; V6M-NEXT:    mov r2, r7
+; V6M-NEXT:    bl test8_g
+; V6M-NEXT:    str r4, [r5]
+; V6M-NEXT:    movs r0, #1
+; V6M-NEXT:    movs r1, #2
+; V6M-NEXT:    movs r2, #3
+; V6M-NEXT:    ldr r3, [sp] @ 4-byte Reload
+; V6M-NEXT:    blx r3
+; V6M-NEXT:    add sp, #4
+; V6M-NEXT:    pop {r4, r5, r6, r7, pc}
 entry:
 %call = tail call %struct.S* bitcast (%struct.S* (...)* @test8_u to %struct.S* ()*)()
 %a = getelementptr inbounds %struct.S, %struct.S* %call, i32 0, i32 0
@@ -225,6 +365,19 @@
 ; CHECK-NEXT:    ldr r2, [r2]
 ; CHECK-NEXT:    pop {r4, r7}
 ; CHECK-NEXT:    b h9
+;
+; V6M-LABEL: test9:
+; V6M:       @ %bb.0:
+; V6M-NEXT:    .save {r4, lr}
+; V6M-NEXT:    push {r4, lr}
+; V6M-NEXT:    ldr r4, [r3]
+; V6M-NEXT:    ldr r3, [r3, #4]
+; V6M-NEXT:    adds r3, r4, r3
+; V6M-NEXT:    ldr r1, [r1]
+; V6M-NEXT:    ldr r0, [r0]
+; V6M-NEXT:    ldr r2, [r2]
+; V6M-NEXT:    bl h9
+; V6M-NEXT:    pop {r4, pc}
 %zz = load i32, i32* %z
 %xx = load i32, i32* %x
 %yy = load i32, i32* %y