Index: lib/Target/ARM/ARMISelLowering.h =================================================================== --- lib/Target/ARM/ARMISelLowering.h +++ lib/Target/ARM/ARMISelLowering.h @@ -771,7 +771,8 @@ const SmallVectorImpl<ISD::OutputArg> &Outs, const SmallVectorImpl<SDValue> &OutVals, const SmallVectorImpl<ISD::InputArg> &Ins, - SelectionDAG& DAG) const; + SelectionDAG& DAG, + const bool PreferIndirect) const; bool CanLowerReturn(CallingConv::ID CallConv, MachineFunction &MF, bool isVarArg, Index: lib/Target/ARM/ARMISelLowering.cpp =================================================================== --- lib/Target/ARM/ARMISelLowering.cpp +++ lib/Target/ARM/ARMISelLowering.cpp @@ -1836,16 +1836,29 @@ bool isThisReturn = false; bool isSibCall = false; auto Attr = MF.getFunction().getFnAttribute("disable-tail-calls"); + bool PreferIndirect = false; // Disable tail calls if they're not supported. if (!Subtarget->supportsTailCall() || Attr.getValueAsString() == "true") isTailCall = false; + if (isa<GlobalAddressSDNode>(Callee)) { + // If we're optimizing for minimum size and the function is called three or + // more times in this block, we can improve codesize by calling indirectly + // as BLXr has a 16-bit encoding. + auto *GV = cast<GlobalAddressSDNode>(Callee)->getGlobal(); + auto *BB = CLI.CS.getParent(); + PreferIndirect = + Subtarget->isThumb() && Subtarget->hasMinSize() && + count_if(GV->users(), [&BB](const User *U) { + return isa<Instruction>(U) && cast<Instruction>(U)->getParent() == BB; + }) > 2; + } if (isTailCall) { // Check if it's really possible to do a tail call. isTailCall = IsEligibleForTailCallOptimization(Callee, CallConv, isVarArg, isStructRet, MF.getFunction().hasStructRetAttr(), - Outs, OutVals, Ins, DAG); + Outs, OutVals, Ins, DAG, PreferIndirect); if (!isTailCall && CLI.CS && CLI.CS.isMustTailCall()) report_fatal_error("failed to perform tail call elimination on a call " "site marked musttail"); @@ -1872,7 +1885,7 @@ // Adjust the stack pointer for the new arguments...
// These operations are automatically eliminated by the prolog/epilog pass - if (!isSibCall) + else Chain = DAG.getCALLSEQ_START(Chain, NumBytes, 0, dl); SDValue StackPtr = @@ -2067,17 +2080,6 @@ MachinePointerInfo::getConstantPool(DAG.getMachineFunction())); } } else if (isa<GlobalAddressSDNode>(Callee)) { - // If we're optimizing for minimum size and the function is called three or - // more times in this block, we can improve codesize by calling indirectly - // as BLXr has a 16-bit encoding. - auto *GV = cast<GlobalAddressSDNode>(Callee)->getGlobal(); - auto *BB = CLI.CS.getParent(); - bool PreferIndirect = - Subtarget->isThumb() && Subtarget->hasMinSize() && - count_if(GV->users(), [&BB](const User *U) { - return isa<Instruction>(U) && cast<Instruction>(U)->getParent() == BB; - }) > 2; - if (!PreferIndirect) { isDirect = true; bool isDef = GV->isStrongDefinitionForLinker(); @@ -2318,18 +2320,19 @@ const SmallVectorImpl<ISD::OutputArg> &Outs, const SmallVectorImpl<SDValue> &OutVals, const SmallVectorImpl<ISD::InputArg> &Ins, - SelectionDAG& DAG) const { + SelectionDAG& DAG, + const bool PreferIndirect) const { MachineFunction &MF = DAG.getMachineFunction(); const Function &CallerF = MF.getFunction(); CallingConv::ID CallerCC = CallerF.getCallingConv(); assert(Subtarget->supportsTailCall()); - // Tail calls to function pointers cannot be optimized for Thumb1 if the args + // Indirect tail calls cannot be optimized for Thumb1 if the args // to the call take up r0-r3. The reason is that there are no legal registers // left to hold the pointer to the function to be called.
if (Subtarget->isThumb1Only() && Outs.size() >= 4 && - !isa<GlobalAddressSDNode>(Callee.getNode())) + (!isa<GlobalAddressSDNode>(Callee.getNode()) || PreferIndirect)) return false; // Look for obvious safe cases to perform tail call optimization that do not Index: test/CodeGen/ARM/pr42062.ll =================================================================== --- /dev/null +++ test/CodeGen/ARM/pr42062.ll @@ -0,0 +1,15 @@ +; RUN: llc -o %t0.s %s 2>&1 | FileCheck --allow-empty %s +target triple = "thumbv8m.base-arm-none-eabi" +@foo = external global i8 +declare i32 @bar(i8* nocapture, i32, i32, i8* nocapture) + +define void @food(i8* %a) #0 { +; CHECK-NOT: error: ran out of registers during register allocation +entry: + %0 = tail call i32 @bar(i8* %a, i32 8, i32 1, i8* nonnull @foo) + %1 = tail call i32 @bar(i8* %a, i32 9, i32 0, i8* nonnull @foo) + %2 = tail call i32 @bar(i8* %a, i32 7, i32 2, i8* nonnull @foo) + ret void +} +attributes #0 = { minsize "target-cpu"="cortex-m23" } +