Index: lib/Target/ARM/ARM.h =================================================================== --- lib/Target/ARM/ARM.h +++ lib/Target/ARM/ARM.h @@ -50,6 +50,8 @@ FunctionPass *createARMOptimizeBarriersPass(); FunctionPass *createThumb2SizeReductionPass( std::function<bool(const Function &)> Ftor = nullptr); +FunctionPass *createThumb1TailCallOptimizerPass(); + InstructionSelector * createARMInstructionSelector(const ARMBaseTargetMachine &TM, const ARMSubtarget &STI, const ARMRegisterBankInfo &RBI); Index: lib/Target/ARM/ARMISelLowering.h =================================================================== --- lib/Target/ARM/ARMISelLowering.h +++ lib/Target/ARM/ARMISelLowering.h @@ -69,6 +69,7 @@ CALL, // Function call. CALL_PRED, // Function call that's predicable. CALL_NOLINK, // Function call with branch not branch-and-link. + CALL_TAIL, // Function call that can be transformed to a tail call. BRCOND, // Conditional branch. BR_JT, // Jumptable branch. BR2_JT, // Jumptable branch (2 level - jumptable entry is a jump). Index: lib/Target/ARM/ARMISelLowering.cpp =================================================================== --- lib/Target/ARM/ARMISelLowering.cpp +++ lib/Target/ARM/ARMISelLowering.cpp @@ -1244,6 +1244,7 @@ case ARMISD::WrapperJT: return "ARMISD::WrapperJT"; case ARMISD::COPY_STRUCT_BYVAL: return "ARMISD::COPY_STRUCT_BYVAL"; case ARMISD::CALL: return "ARMISD::CALL"; + case ARMISD::CALL_TAIL: return "ARMISD::CALL_TAIL"; case ARMISD::CALL_PRED: return "ARMISD::CALL_PRED"; case ARMISD::CALL_NOLINK: return "ARMISD::CALL_NOLINK"; case ARMISD::BRCOND: return "ARMISD::BRCOND"; @@ -1787,6 +1788,7 @@ bool isStructRet = (Outs.empty()) ? false : Outs[0].Flags.isSRet(); bool isThisReturn = false; bool isSibCall = false; + bool isThumb1TailCall = false; auto Attr = MF.getFunction().getFnAttribute("disable-tail-calls"); // Disable tail calls if they're not supported. 
@@ -1798,9 +1800,18 @@ isTailCall = IsEligibleForTailCallOptimization(Callee, CallConv, isVarArg, isStructRet, MF.getFunction().hasStructRetAttr(), Outs, OutVals, Ins, DAG); - if (!isTailCall && CLI.CS && CLI.CS.isMustTailCall()) + if ((!isTailCall || Subtarget->isThumb1Only()) && + CLI.CS && CLI.CS.isMustTailCall()) report_fatal_error("failed to perform tail call elimination on a call " "site marked musttail"); + if (isTailCall && Subtarget->isThumb1Only()) { + // For Thumb1, we make a "tail call" which is sort of like an LLVM IR + // tail call; it's not a terminator, just a marker indicating that we + // could transform it to a terminator later. + ++NumTailCalls; + isThumb1TailCall = true; + isTailCall = false; + } // We don't support GuaranteedTailCallOpt for ARM, only automatically // detected sibcalls. if (isTailCall) { @@ -2112,6 +2123,8 @@ if (Subtarget->isThumb()) { if ((!isDirect || isARMFunc) && !Subtarget->hasV5TOps()) CallOpc = ARMISD::CALL_NOLINK; + else if (isThumb1TailCall) + CallOpc = ARMISD::CALL_TAIL; + else CallOpc = ARMISD::CALL; } else { @@ -2299,12 +2312,29 @@ assert(Subtarget->supportsTailCall()); - // Tail calls to function pointers cannot be optimized for Thumb1 if the args - // to the call take up r0-r3. The reason is that there are no legal registers - // left to hold the pointer to the function to be called. - if (Subtarget->isThumb1Only() && Outs.size() >= 4 && - !isa<GlobalAddressSDNode>(Callee.getNode())) + if (Subtarget->isThumb1Only()) { + // FIXME: This approximation isn't right for non-ELF targets. + if (!Subtarget->isTargetELF()) + return false; + bool IsImmediateCall = isa<GlobalAddressSDNode>(Callee.getNode()) || + isa<ExternalSymbolSDNode>(Callee.getNode()); + + // Tail calls to function pointers cannot be optimized for Thumb1 if the + // args to the call take up r0-r3. The reason is that there are no legal + // registers left to hold the pointer to the function to be called. 
r12 is + // free, but it would be tricky to emit the right sequence because LLVM + // doesn't treat it as allocatable. + if (Outs.size() >= 4 && (!Subtarget->hasV8MBaselineOps() || + !IsImmediateCall)) + return false; + + // Don't try to emit a tail call on Thumb1 if the callee is a known global; + // we would be forced to load the address to a GPR. v8m is the exception: it + // supports the required immediate branch. (This restriction shouldn't be + // necessary for correctness; it's just a codesize optimization.) + if (!Subtarget->hasV8MBaselineOps() && IsImmediateCall) return false; + } // Look for obvious safe cases to perform tail call optimization that do not // require ABI changes. This is what gcc calls sibcall. @@ -2366,6 +2396,12 @@ CCState CCInfo(CalleeCC, isVarArg, MF, ArgLocs, C); CCInfo.AnalyzeCallOperands(Outs, CCAssignFnForCall(CalleeCC, isVarArg)); if (CCInfo.getNextStackOffset()) { + // On Thumb1, don't tail call functions which pass data on the stack; + // we need to perform the transform after isel, and the transform + // doesn't currently reason about the stack. + if (Subtarget->isThumb1Only()) + return false; + // Check if the arguments are already laid out in the right way as // the caller's fixed stack objects. 
MachineFrameInfo &MFI = MF.getFrameInfo(); Index: lib/Target/ARM/ARMInstrInfo.td =================================================================== --- lib/Target/ARM/ARMInstrInfo.td +++ lib/Target/ARM/ARMInstrInfo.td @@ -131,6 +131,9 @@ def ARMcall : SDNode<"ARMISD::CALL", SDT_ARMcall, [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue, SDNPVariadic]>; +def ARMcall_tail : SDNode<"ARMISD::CALL_TAIL", SDT_ARMcall, + [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue, + SDNPVariadic]>; def ARMcall_pred : SDNode<"ARMISD::CALL_PRED", SDT_ARMcall, [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue, SDNPVariadic]>; Index: lib/Target/ARM/ARMInstrThumb.td =================================================================== --- lib/Target/ARM/ARMInstrThumb.td +++ lib/Target/ARM/ARMInstrThumb.td @@ -620,6 +620,17 @@ } } +let isCall = 1, Defs = [LR], Uses = [SP] in { + def tCALL_TAILr : tPseudoExpand<(outs), (ins tcGPR:$func, pred:$p), + 2, IIC_Br, [(ARMcall_tail tcGPR:$func)], + (tBLXr pred:$p, GPR:$func)>, + Requires<[IsThumb, HasV5T]>, Sched<[WriteBrL]>; + def tCALL_TAILi : tPseudoExpand<(outs), (ins thumb_bl_target:$func, pred:$p), + 2, IIC_Br, [(ARMcall_tail tglobaladdr:$func)], + (tBL pred:$p, thumb_bl_target:$func)>, + Requires<[IsThumb, HasV8MBaseline]>, Sched<[WriteBr]>; +} + // A8.6.218 Supervisor Call (Software Interrupt) // A8.6.16 B: Encoding T1 @@ -1550,6 +1561,9 @@ // Direct calls def : T1Pat<(ARMcall texternalsym:$func), (tBL texternalsym:$func)>, Requires<[IsThumb]>; +def : T1Pat<(ARMcall_tail texternalsym:$func), + (tCALL_TAILi texternalsym:$func)>, + Requires<[IsThumb]>; // zextload i1 -> zextload i8 def : T1Pat<(zextloadi1 t_addrmode_is1:$addr), Index: lib/Target/ARM/ARMSubtarget.cpp =================================================================== --- lib/Target/ARM/ARMSubtarget.cpp +++ lib/Target/ARM/ARMSubtarget.cpp @@ -206,28 +206,10 @@ if (isTargetNaCl() || isAAPCS16_ABI()) stackAlignment = 16; - // FIXME: Completely disable sibcall for Thumb1 since ThumbRegisterInfo:: - // 
emitEpilogue is not ready for them. Thumb tail calls also use t2B, as - // the Thumb1 16-bit unconditional branch doesn't have sufficient relocation - // support in the assembler and linker to be used. This would need to be - // fixed to fully support tail calls in Thumb1. - // - // For ARMv8-M, we /do/ implement tail calls. Doing this is tricky for v8-M - // baseline, since the LDM/POP instruction on Thumb doesn't take LR. This - // means if we need to reload LR, it takes extra instructions, which outweighs - // the value of the tail call; but here we don't know yet whether LR is going - // to be used. We take the optimistic approach of generating the tail call and - // perhaps taking a hit if we need to restore the LR. - - // Thumb1 PIC calls to external symbols use BX, so they can be tail calls, - // but we need to make sure there are enough registers; the only valid - // registers are the 4 used for parameters. We don't currently do this - // case. - - SupportsTailCall = !isThumb() || hasV8MBaselineOps(); - if (isTargetMachO() && isTargetIOS() && getTargetTriple().isOSVersionLT(5, 0)) SupportsTailCall = false; + else + SupportsTailCall = true; switch (IT) { case DefaultIT: Index: lib/Target/ARM/ARMTargetMachine.cpp =================================================================== --- lib/Target/ARM/ARMTargetMachine.cpp +++ lib/Target/ARM/ARMTargetMachine.cpp @@ -470,6 +470,8 @@ if (!DisableA15SDOptimization) addPass(createA15SDOptimizerPass()); + + addPass(createThumb1TailCallOptimizerPass()); } } Index: lib/Target/ARM/CMakeLists.txt =================================================================== --- lib/Target/ARM/CMakeLists.txt +++ lib/Target/ARM/CMakeLists.txt @@ -51,6 +51,7 @@ MLxExpansionPass.cpp Thumb1FrameLowering.cpp Thumb1InstrInfo.cpp + Thumb1TailCallOptimizer.cpp ThumbRegisterInfo.cpp Thumb2ITBlockPass.cpp Thumb2InstrInfo.cpp Index: lib/Target/ARM/Thumb1TailCallOptimizer.cpp =================================================================== 
--- /dev/null +++ lib/Target/ARM/Thumb1TailCallOptimizer.cpp @@ -0,0 +1,129 @@ +//=== Thumb1TailCallOptimizer.cpp - Optimize Thumb1 tail calls-------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// On Thumb1, tail calls are restricted due to the limited instruction set. +// The key restriction that we need to deal with here, rather than during +// isel, is that restoring lr from the stack is expensive: it adds at least +// four bytes to the epilogue. So if we're optimizing for size, we must avoid +// emitting any tail calls in a function that contains any non-tail +// call. +// +// This pass scans each Thumb1 function for non-tail calls; if it finds any, +// it does nothing. Otherwise, it takes each call with a tail marking, +// and transforms it to a branch. +// +//===----------------------------------------------------------------------===// + +#include "ARM.h" +#include "ARMSubtarget.h" +#include "llvm/CodeGen/MachineFunctionPass.h" + +using namespace llvm; + +#define DEBUG_TYPE "thumb1-tail-call-optimizer" + +namespace { + struct Thumb1TailCallOptimizer : public MachineFunctionPass { + static char ID; + Thumb1TailCallOptimizer() : MachineFunctionPass(ID) {} + + bool runOnMachineFunction(MachineFunction &Fn) override; + + StringRef getPassName() const override { return "Thumb1 tail call optimizer"; } + }; + char Thumb1TailCallOptimizer::ID = 0; +} // end anonymous namespace + +bool Thumb1TailCallOptimizer::runOnMachineFunction(MachineFunction &MF) { + if (skipFunction(MF.getFunction())) + return false; + + const ARMSubtarget &STI = MF.getSubtarget<ARMSubtarget>(); + if (!STI.isThumb1Only() || !MF.getFrameInfo().hasCalls()) + return false; + + LLVM_DEBUG(dbgs() << "Tail call optimizer analyzing func: " << MF.getName() + << "\n"); + const MachineFrameInfo 
&MFI = MF.getFrameInfo(); + if (!MFI.hasCalls()) + return false; + + for (auto &MBB : MF) { + bool AfterCall = false; + for (auto &MI: MBB) { + // Make sure no unexpected instruction snuck in after a tail call. + if (AfterCall) { + if (MI.getOpcode() == ARM::COPY) { + // FIXME: Check that the COPY doesn't affect some unexpected + // register? Unlikely to matter. + } else if (MI.getOpcode() != ARM::ADJCALLSTACKUP && + MI.getOpcode() != ARM::tBX_RET) { + return false; + } + continue; + } + + if (MI.getOpcode() == ARM::tCALL_TAILr || + MI.getOpcode() == ARM::tCALL_TAILi) + AfterCall = true; + // Reject other calls because the transform would increase codesize. + else if (MI.definesRegister(ARM::LR)) + return false; + } + } + + bool Changed = false; + for (auto &MBB : MF) { + bool HasTailCall = false; + for (auto &MI: MBB) + HasTailCall |= MI.getOpcode() == ARM::tCALL_TAILr || + MI.getOpcode() == ARM::tCALL_TAILi; + if (!HasTailCall) + continue; + + const ARMBaseInstrInfo *TII = STI.getInstrInfo(); + auto MII = std::prev(MBB.end()); + while (MII->getOpcode() != ARM::tCALL_TAILr && + MII->getOpcode() != ARM::tCALL_TAILi) { + if (MII->getOpcode() == ARM::ADJCALLSTACKUP) { + --MII; + continue; + } + --MII; + std::next(MII)->eraseFromParent(); + } + + MachineInstr *CallInst = &*MII; + LLVM_DEBUG(dbgs() << "Replacing Thumb1 tail call\n" << *CallInst); + DebugLoc Loc = CallInst->getDebugLoc(); + if (CallInst->getOpcode() == ARM::tCALL_TAILr) { + unsigned DstReg = CallInst->getOperand(0).getReg(); + BuildMI(MBB, MBB.end(), Loc, TII->get(ARM::TCRETURNri)) + .addUse(DstReg, RegState::Kill); + } else { + BuildMI(MBB, MBB.end(), Loc, TII->get(ARM::TCRETURNdi)) + .add(CallInst->getOperand(0)); + } + auto NewMI = std::prev(MBB.end()); + + for (unsigned i = 3, e = CallInst->getNumOperands(); i != e; ++i) { + MachineOperand &Op = CallInst->getOperand(i); + if (Op.isReg() && !Op.isDef() && Op.getReg() != ARM::SP) + NewMI->addOperand(CallInst->getOperand(i)); + } + 
CallInst->eraseFromParent(); + Changed = true; + } + return Changed; +} + +FunctionPass *llvm::createThumb1TailCallOptimizerPass() { + return new Thumb1TailCallOptimizer(); +} + Index: test/CodeGen/ARM/thumb_indirect_calls.ll =================================================================== --- test/CodeGen/ARM/thumb_indirect_calls.ll +++ test/CodeGen/ARM/thumb_indirect_calls.ll @@ -4,7 +4,7 @@ @f = common global void (i32)* null, align 4 ; CHECK-LABEL: foo: -define void @foo(i32 %x) { +define void @foo(i32 %x) "disable-tail-calls"="true" { entry: %0 = load void (i32)*, void (i32)** @f, align 4 tail call void %0(i32 %x) @@ -22,7 +22,7 @@ } ; CHECK-LABEL: bar: -define void @bar(void (i32)* nocapture %g, i32 %x, void (i32)* nocapture %h) { +define void @bar(void (i32)* nocapture %g, i32 %x, void (i32)* nocapture %h) "disable-tail-calls"="true" { entry: tail call void %g(i32 %x) tail call void %h(i32 %x) Index: test/CodeGen/ARM/v8m-tail-call.ll =================================================================== --- test/CodeGen/ARM/v8m-tail-call.ll +++ test/CodeGen/ARM/v8m-tail-call.ll @@ -1,34 +1,63 @@ -; RUN: llc %s -o - -mtriple=thumbv8m.base | FileCheck %s +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc %s -o - -mtriple=thumbv8m.base -verify-machineinstrs | FileCheck %s declare i32 @g(...) 
declare i32 @h0(i32, i32, i32, i32) define hidden i32 @f0() { +; CHECK-LABEL: f0: +; CHECK: @ %bb.0: +; CHECK-NEXT: push {r7, lr} +; CHECK-NEXT: bl g +; CHECK-NEXT: movs r1, #1 +; CHECK-NEXT: movs r2, #2 +; CHECK-NEXT: movs r3, #3 +; CHECK-NEXT: bl h0 +; CHECK-NEXT: pop {r7, pc} %1 = tail call i32 bitcast (i32 (...)* @g to i32 ()*)() %2 = tail call i32 @h0(i32 %1, i32 1, i32 2, i32 3) ret i32 %2 -; CHECK-LABEL: f0 -; CHECK: ldr [[POP:r[4567]]], [sp, #4] -; CHECK-NEXT: mov lr, [[POP]] -; CHECK-NEXT: pop {{.*}}[[POP]] -; CHECK-NEXT: add sp, #4 -; CHECK-NEXT: b h0 } declare i32 @h1(i32) define hidden i32 @f1() { +; CHECK-LABEL: f1: +; CHECK: @ %bb.0: +; CHECK-NEXT: push {r7, lr} +; CHECK-NEXT: bl g +; CHECK-NEXT: bl h1 +; CHECK-NEXT: pop {r7, pc} %1 = tail call i32 bitcast (i32 (...)* @g to i32 ()*)() %2 = tail call i32 @h1(i32 %1) ret i32 %2 -; CHECK-LABEL: f1 -; CHECK: pop {r7} -; CHECK: pop {r1} -; CHECK: mov lr, r1 -; CHECK: b h1 } declare i32 @h2(i32, i32, i32, i32, i32) define hidden i32 @f2(i32, i32, i32, i32, i32) { +; CHECK-LABEL: f2: +; CHECK: @ %bb.0: +; CHECK-NEXT: push {r4, r5, r6, lr} +; CHECK-NEXT: sub sp, #8 +; CHECK-NEXT: mov r4, r3 +; CHECK-NEXT: mov r5, r2 +; CHECK-NEXT: mov r6, r1 +; CHECK-NEXT: bl g +; CHECK-NEXT: cbz r0, .LBB2_2 +; CHECK-NEXT: @ %bb.1: +; CHECK-NEXT: ldr r1, [sp, #24] +; CHECK-NEXT: mov r2, sp +; CHECK-NEXT: str r1, [r2] +; CHECK-NEXT: mov r1, r6 +; CHECK-NEXT: mov r2, r5 +; CHECK-NEXT: mov r3, r4 +; CHECK-NEXT: bl h2 +; CHECK-NEXT: add sp, #8 +; CHECK-NEXT: pop {r4, r5, r6, pc} +; CHECK-NEXT: .LBB2_2: +; CHECK-NEXT: movs r0, #0 +; CHECK-NEXT: mvns r0, r0 +; CHECK-NEXT: add sp, #8 +; CHECK-NEXT: pop {r4, r5, r6, pc} %6 = tail call i32 bitcast (i32 (...)* @g to i32 ()*)() %7 = icmp eq i32 %6, 0 br i1 %7, label %10, label %8 @@ -38,12 +67,6 @@ %11 = phi i32 [ %9, %8 ], [ -1, %5 ] ret i32 %11 -; CHECK-LABEL: f2 -; CHECK: ldr [[POP:r[4567]]], [sp, #12] -; CHECK-NEXT: mov lr, [[POP]] -; CHECK-NEXT: pop {{.*}}[[POP]] -; CHECK-NEXT: add 
sp, #4 -; CHECK-NEXT: b h2 } ; Make sure that tail calls to function pointers that require r0-r3 for argument @@ -51,7 +74,17 @@ @fnptr = global i32 (i32, i32, i32, i32)* null define i32 @test3() { ; CHECK-LABEL: test3: -; CHECK: blx {{r[0-9]+}} +; CHECK: @ %bb.0: +; CHECK-NEXT: push {r4, lr} +; CHECK-NEXT: movw r0, :lower16:fnptr +; CHECK-NEXT: movt r0, :upper16:fnptr +; CHECK-NEXT: ldr r4, [r0] +; CHECK-NEXT: movs r0, #1 +; CHECK-NEXT: movs r1, #2 +; CHECK-NEXT: movs r2, #3 +; CHECK-NEXT: movs r3, #4 +; CHECK-NEXT: blx r4 +; CHECK-NEXT: pop {r4, pc} %1 = load i32 (i32, i32, i32, i32)*, i32 (i32, i32, i32, i32)** @fnptr %2 = tail call i32 %1(i32 1, i32 2, i32 3, i32 4) ret i32 %2 @@ -60,7 +93,17 @@ @fnptr2 = global i32 (i32, i32, i64)* null define i32 @test4() { ; CHECK-LABEL: test4: -; CHECK: blx {{r[0-9]+}} +; CHECK: @ %bb.0: +; CHECK-NEXT: push {r4, lr} +; CHECK-NEXT: movw r0, :lower16:fnptr2 +; CHECK-NEXT: movt r0, :upper16:fnptr2 +; CHECK-NEXT: ldr r4, [r0] +; CHECK-NEXT: movs r0, #1 +; CHECK-NEXT: movs r1, #2 +; CHECK-NEXT: movs r2, #3 +; CHECK-NEXT: movs r3, #0 +; CHECK-NEXT: blx r4 +; CHECK-NEXT: pop {r4, pc} %1 = load i32 (i32, i32, i64)*, i32 (i32, i32, i64)** @fnptr2 %2 = tail call i32 %1(i32 1, i32 2, i64 3) ret i32 %2 @@ -72,9 +115,13 @@ @fnptr3 = global i32 (i32, i32)* null define i32 @test5() { ; CHECK-LABEL: test5: -; CHECK: ldr [[REG:r[0-9]+]] -; CHECK: bx [[REG]] -; CHECK-NOT: blx [[REG]] +; CHECK: @ %bb.0: +; CHECK-NEXT: movw r0, :lower16:fnptr3 +; CHECK-NEXT: movt r0, :upper16:fnptr3 +; CHECK-NEXT: ldr r2, [r0] +; CHECK-NEXT: movs r0, #1 +; CHECK-NEXT: movs r1, #2 +; CHECK-NEXT: bx r2 %1 = load i32 (i32, i32)*, i32 (i32, i32)** @fnptr3 %2 = tail call i32 %1(i32 1, i32 2) ret i32 %2 @@ -84,9 +131,14 @@ @fnptr4 = global i32 (i32, i64)* null define i32 @test6() { ; CHECK-LABEL: test6: -; CHECK: ldr [[REG:r[0-9]+]] -; CHECK: bx [[REG]] -; CHECK-NOT: blx [[REG]] +; CHECK: @ %bb.0: +; CHECK-NEXT: movw r0, :lower16:fnptr4 +; CHECK-NEXT: movt r0, 
:upper16:fnptr4 +; CHECK-NEXT: ldr r1, [r0] +; CHECK-NEXT: movs r0, #1 +; CHECK-NEXT: movs r2, #2 +; CHECK-NEXT: movs r3, #0 +; CHECK-NEXT: bx r1 %1 = load i32 (i32, i64)*, i32 (i32, i64)** @fnptr4 %2 = tail call i32 %1(i32 1, i64 2) ret i32 %2 @@ -96,8 +148,12 @@ ; tail-call optimized. define i32 @test7() { ; CHECK-LABEL: test7: -; CHECK: b bar -; CHECK-NOT: bl bar +; CHECK: @ %bb.0: +; CHECK-NEXT: movs r0, #1 +; CHECK-NEXT: movs r1, #2 +; CHECK-NEXT: movs r2, #3 +; CHECK-NEXT: movs r3, #4 +; CHECK-NEXT: b bar %tail = tail call i32 @bar(i32 1, i32 2, i32 3, i32 4) ret i32 %tail } @@ -109,6 +165,30 @@ %struct.S = type { i32 } define void @test8(i32 (i32, i32, i32)* nocapture %fn, i32 %x) local_unnamed_addr { +; CHECK-LABEL: test8: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: push {r4, r5, r6, r7, lr} +; CHECK-NEXT: sub sp, #4 +; CHECK-NEXT: mov r4, r1 +; CHECK-NEXT: str r0, [sp] @ 4-byte Spill +; CHECK-NEXT: bl test8_u +; CHECK-NEXT: mov r5, r0 +; CHECK-NEXT: ldr r6, [r0] +; CHECK-NEXT: movs r7, #0 +; CHECK-NEXT: mov r0, r7 +; CHECK-NEXT: bl test8_h +; CHECK-NEXT: mov r1, r0 +; CHECK-NEXT: mov r0, r6 +; CHECK-NEXT: mov r2, r7 +; CHECK-NEXT: bl test8_g +; CHECK-NEXT: str r4, [r5] +; CHECK-NEXT: movs r0, #1 +; CHECK-NEXT: movs r1, #2 +; CHECK-NEXT: movs r2, #3 +; CHECK-NEXT: ldr r3, [sp] @ 4-byte Reload +; CHECK-NEXT: blx r3 +; CHECK-NEXT: add sp, #4 +; CHECK-NEXT: pop {r4, r5, r6, r7, pc} entry: %call = tail call %struct.S* bitcast (%struct.S* (...)* @test8_u to %struct.S* ()*)() %a = getelementptr inbounds %struct.S, %struct.S* %call, i32 0, i32 0 @@ -125,6 +205,3 @@ declare i32 @test8_g(i32, i32, i32) declare i32 @test8_h(i32) -; CHECK: str r0, [sp] @ 4-byte Spill -; CHECK: ldr r3, [sp] @ 4-byte Reload -; CHECK: bx r3