Index: llvm/docs/BitCodeFormat.rst
===================================================================
--- llvm/docs/BitCodeFormat.rst
+++ llvm/docs/BitCodeFormat.rst
@@ -795,6 +795,8 @@
 * ``swiftcc`` : code 16
 * ``cxx_fast_tlscc``: code 17
 * ``tailcc`` : code 18
+* ``cfguard_checkcc`` : code 19
+* ``swifttailcc`` : code 20
 * ``x86_stdcallcc``: code 64
 * ``x86_fastcallcc``: code 65
 * ``arm_apcscc``: code 66
Index: llvm/docs/CodeGenerator.rst
===================================================================
--- llvm/docs/CodeGenerator.rst
+++ llvm/docs/CodeGenerator.rst
@@ -2064,11 +2064,12 @@
 ----------------------
 
 Tail call optimization, callee reusing the stack of the caller, is currently
-supported on x86/x86-64, PowerPC, and WebAssembly. It is performed on x86/x86-64
-and PowerPC if:
+supported on x86/x86-64, PowerPC, AArch64, and WebAssembly. It is performed on
+x86/x86-64, PowerPC, and AArch64 if:
 
 * Caller and callee have the calling convention ``fastcc``, ``cc 10`` (GHC
-  calling convention), ``cc 11`` (HiPE calling convention), or ``tailcc``.
+  calling convention), ``cc 11`` (HiPE calling convention), ``tailcc``, or
+  ``swifttailcc``.
 
 * The call is a tail call - in tail position (ret immediately follows call and
   ret uses value of call or is void).
@@ -2102,6 +2103,10 @@
 * The caller and callee's return types must match. The caller cannot be void
   unless the callee is, too.
 
+AArch64 constraints:
+
+* No variable argument lists are used.
+
 Example:
 
 Call as ``llc -tailcallopt test.ll``.
Index: llvm/docs/LangRef.rst
===================================================================
--- llvm/docs/LangRef.rst
+++ llvm/docs/LangRef.rst
@@ -432,10 +432,6 @@
     - On X86-64 the callee preserves all general purpose registers, except for
       RDI and RAX.
-"``swiftcc``" - This calling convention is used for Swift language.
-    - On X86-64 RCX and R8 are available for additional integer returns, and
-      XMM2 and XMM3 are available for additional FP/vector returns.
-    - On iOS platforms, we use AAPCS-VFP calling convention.
 "``tailcc``" - Tail callable calling convention
     This calling convention ensures that calls in tail position will always be
     tail call optimized. This calling convention is equivalent to fastcc,
@@ -444,6 +440,14 @@
     the GHC or the HiPE convention is used. `_
     This calling convention does not support varargs and requires the prototype
     of all callees to exactly match the prototype of the function definition.
+"``swiftcc``" - This calling convention is used for Swift language.
+    - On X86-64 RCX and R8 are available for additional integer returns, and
+      XMM2 and XMM3 are available for additional FP/vector returns.
+    - On iOS platforms, we use AAPCS-VFP calling convention.
+"``swifttailcc``"
+    This calling convention is like ``swiftcc`` in most respects, but also the
+    callee pops the argument area of the stack so that mandatory tail calls are
+    possible as in ``tailcc``.
 "``cfguard_checkcc``" - Windows Control Flow Guard (Check mechanism)
     This calling convention is used for the Control Flow Guard check function,
     calls to which can be inserted before indirect calls to check that the call
Index: llvm/include/llvm/IR/CallingConv.h
===================================================================
--- llvm/include/llvm/IR/CallingConv.h
+++ llvm/include/llvm/IR/CallingConv.h
@@ -86,6 +86,11 @@
   /// and has no return value. All register values are preserved.
   CFGuard_Check = 19,
 
+  /// SwiftTail - This follows the Swift calling convention in how arguments
+  /// are passed but guarantees tail calls will be made by making the callee
+  /// clean up their stack.
+  SwiftTail = 20,
+
   // Target - This is the start of the target-specific calling conventions,
   // e.g. fastcall and thiscall on X86.
   FirstTargetCC = 64,
Index: llvm/lib/AsmParser/LLLexer.cpp
===================================================================
--- llvm/lib/AsmParser/LLLexer.cpp
+++ llvm/lib/AsmParser/LLLexer.cpp
@@ -609,6 +609,7 @@
   KEYWORD(x86_regcallcc);
   KEYWORD(webkit_jscc);
   KEYWORD(swiftcc);
+  KEYWORD(swifttailcc);
   KEYWORD(anyregcc);
   KEYWORD(preserve_mostcc);
   KEYWORD(preserve_allcc);
Index: llvm/lib/AsmParser/LLParser.cpp
===================================================================
--- llvm/lib/AsmParser/LLParser.cpp
+++ llvm/lib/AsmParser/LLParser.cpp
@@ -2105,6 +2105,7 @@
 /// ::= 'preserve_allcc'
 /// ::= 'ghccc'
 /// ::= 'swiftcc'
+/// ::= 'swifttailcc'
 /// ::= 'x86_intrcc'
 /// ::= 'hhvmcc'
 /// ::= 'hhvm_ccc'
@@ -2155,6 +2156,7 @@
   case lltok::kw_preserve_allcc: CC = CallingConv::PreserveAll; break;
   case lltok::kw_ghccc: CC = CallingConv::GHC; break;
   case lltok::kw_swiftcc: CC = CallingConv::Swift; break;
+  case lltok::kw_swifttailcc: CC = CallingConv::SwiftTail; break;
   case lltok::kw_x86_intrcc: CC = CallingConv::X86_INTR; break;
   case lltok::kw_hhvmcc: CC = CallingConv::HHVM; break;
   case lltok::kw_hhvm_ccc: CC = CallingConv::HHVM_C; break;
Index: llvm/lib/AsmParser/LLToken.h
===================================================================
--- llvm/lib/AsmParser/LLToken.h
+++ llvm/lib/AsmParser/LLToken.h
@@ -156,6 +156,7 @@
   kw_webkit_jscc,
   kw_anyregcc,
   kw_swiftcc,
+  kw_swifttailcc,
   kw_preserve_mostcc,
   kw_preserve_allcc,
   kw_ghccc,
Index: llvm/lib/CodeGen/Analysis.cpp
===================================================================
--- llvm/lib/CodeGen/Analysis.cpp
+++ llvm/lib/CodeGen/Analysis.cpp
@@ -511,9 +511,10 @@
   // not profitable. Also, if the callee is a special function (e.g.
   // longjmp on x86), it can end up causing miscompilation that has not
   // been fully understood.
- if (!Ret && - ((!TM.Options.GuaranteedTailCallOpt && - Call.getCallingConv() != CallingConv::Tail) || !isa(Term))) + if (!Ret && ((!TM.Options.GuaranteedTailCallOpt && + Call.getCallingConv() != CallingConv::Tail && + Call.getCallingConv() != CallingConv::SwiftTail) || + !isa(Term))) return false; // If I will have a chain, make sure no other instruction that will have a Index: llvm/lib/IR/AsmWriter.cpp =================================================================== --- llvm/lib/IR/AsmWriter.cpp +++ llvm/lib/IR/AsmWriter.cpp @@ -388,6 +388,7 @@ case CallingConv::SPIR_FUNC: Out << "spir_func"; break; case CallingConv::SPIR_KERNEL: Out << "spir_kernel"; break; case CallingConv::Swift: Out << "swiftcc"; break; + case CallingConv::SwiftTail: Out << "swifttailcc"; break; case CallingConv::X86_INTR: Out << "x86_intrcc"; break; case CallingConv::HHVM: Out << "hhvmcc"; break; case CallingConv::HHVM_C: Out << "hhvm_ccc"; break; Index: llvm/lib/Target/AArch64/AArch64CallingConvention.td =================================================================== --- llvm/lib/Target/AArch64/AArch64CallingConvention.td +++ llvm/lib/Target/AArch64/AArch64CallingConvention.td @@ -420,6 +420,9 @@ X19, X20, X21, X22, X23, X24, X25, X26, X27, X28, LR, FP)>; +def CSR_AArch64_AAPCS_SwiftTail + : CalleeSavedRegs<(sub CSR_AArch64_AAPCS, X20, X22)>; + // Constructors and destructors return 'this' in the iOS 64-bit C++ ABI; since // 'this' and the pointer return value are both passed in X0 in these cases, // this can be partially modelled by treating X0 as a callee-saved register; @@ -472,6 +475,9 @@ def CSR_Darwin_AArch64_AAPCS_SwiftError : CalleeSavedRegs<(sub CSR_Darwin_AArch64_AAPCS, X21)>; +def CSR_Darwin_AArch64_AAPCS_SwiftTail + : CalleeSavedRegs<(sub CSR_Darwin_AArch64_AAPCS, X20, X22)>; + // The function used by Darwin to obtain the address of a thread-local variable // guarantees more than a normal AAPCS function. x16 and x17 are used on the // fast path for calculation, but other registers except X0 (argument/return) Index: llvm/lib/Target/AArch64/AArch64FrameLowering.cpp =================================================================== --- llvm/lib/Target/AArch64/AArch64FrameLowering.cpp +++ llvm/lib/Target/AArch64/AArch64FrameLowering.cpp @@ -193,9 +193,13 @@ STATISTIC(NumRedZoneFunctions, "Number of functions using red zone"); -/// Returns the argument pop size. -static uint64_t getArgumentPopSize(MachineFunction &MF, - MachineBasicBlock &MBB) { +// Returns how much of the incoming argument stack area we should clean up in an +// epilogue. For the C calling convention this will be 0, for guaranteed tail +// call conventions it can be positive (a normal return or a tail call to a +// function that uses less stack space for arguments) or negative (for a tail +// call to a function that needs more stack space than us for arguments). 
+static int64_t getArgumentStackToRestore(MachineFunction &MF, + MachineBasicBlock &MBB) { MachineBasicBlock::iterator MBBI = MBB.getLastNonDebugInstr(); bool IsTailCallReturn = false; if (MBB.end() != MBBI) { @@ -206,7 +210,7 @@ } AArch64FunctionInfo *AFI = MF.getInfo(); - uint64_t ArgumentPopSize = 0; + int64_t ArgumentPopSize = 0; if (IsTailCallReturn) { MachineOperand &StackAdjust = MBBI->getOperand(1); @@ -255,7 +259,7 @@ const TargetRegisterInfo *RegInfo = MF.getSubtarget().getRegisterInfo(); if (MFI.hasVarSizedObjects() || RegInfo->hasStackRealignment(MF)) return false; - if (Exit && getArgumentPopSize(MF, *Exit)) + if (Exit && getArgumentStackToRestore(MF, *Exit)) return false; return true; @@ -311,10 +315,10 @@ const AArch64FunctionInfo *AFI, bool IsWin64, bool IsFunclet) { if (!IsWin64 || IsFunclet) { - // Only Win64 uses fixed objects, and then only for the function (not - // funclets) - return 0; + return AFI->getTailCallReservedStack(); } else { + assert(AFI->getTailCallReservedStack() == 0 && + "don't know how guaranteed tail calls might work on Win64"); // Var args are stored here in the primary function. const unsigned VarArgsArea = AFI->getVarArgsGPRSize(); // To support EH funclets we allocate an UnwindHelp object @@ -1667,9 +1671,9 @@ if (MF.getFunction().getCallingConv() == CallingConv::GHC) return; - // Initial and residual are named for consistency with the prologue. Note that - // in the epilogue, the residual adjustment is executed first. - uint64_t ArgumentPopSize = getArgumentPopSize(MF, MBB); + // How much of the stack used by incoming arguments this function is expected + // to restore in this particular epilogue. + int64_t ArgumentStackToRestore = getArgumentStackToRestore(MF, MBB); // The stack frame should be like below, // @@ -1704,7 +1708,7 @@ Subtarget.isCallingConvWin64(MF.getFunction().getCallingConv()); unsigned FixedObject = getFixedObjectSize(MF, AFI, IsWin64, IsFunclet); - uint64_t AfterCSRPopSize = ArgumentPopSize; + int64_t AfterCSRPopSize = ArgumentStackToRestore; auto PrologueSaveSize = AFI->getCalleeSavedStackSize() + FixedObject; // We cannot rely on the local stack size set in emitPrologue if the function // has funclets, as funclets have different local stack size requirements, and @@ -1741,8 +1745,10 @@ // Converting the last ldp to a post-index ldp is valid only if the last // ldp's offset is 0. const MachineOperand &OffsetOp = Pop->getOperand(Pop->getNumOperands() - 1); - // If the offset is 0, convert it to a post-index ldp. - if (OffsetOp.getImm() == 0) + // If the offset is 0 and the AfterCSR pop is not actually trying to + // allocate more stack for arguments (in space that an untimely interrupt + // may clobber), convert it to a post-index ldp. + if (OffsetOp.getImm() == 0 && AfterCSRPopSize >= 0) convertCalleeSaveRestoreToSPPrePostIncDec( MBB, Pop, DL, TII, PrologueSaveSize, NeedsWinCFI, &HasWinCFI, false); else { @@ -1913,6 +1919,8 @@ // assumes the SP is at the same location as it was after the callee-save save // code in the prologue. if (AfterCSRPopSize) { + assert(AfterCSRPopSize > 0 && "attempting to reallocate arg stack that an " + "interrupt may have clobbered"); // Find an insertion point for the first ldp so that it goes before the // shadow call stack epilog instruction. This ensures that the restore of // lr from x18 is placed after the restore from sp. 
@@ -1928,7 +1936,7 @@ adaptForLdStOpt(MBB, FirstSPPopI, LastPopI); emitFrameOffset(MBB, FirstSPPopI, DL, AArch64::SP, AArch64::SP, - StackOffset::getFixed((int64_t)AfterCSRPopSize), TII, + StackOffset::getFixed(AfterCSRPopSize), TII, MachineInstr::FrameDestroy, false, NeedsWinCFI, &HasWinCFI); } if (HasWinCFI) @@ -2146,7 +2154,8 @@ AttributeList Attrs = MF.getFunction().getAttributes(); return Subtarget.isTargetMachO() && !(Subtarget.getTargetLowering()->supportSwiftError() && - Attrs.hasAttrSomewhere(Attribute::SwiftError)); + Attrs.hasAttrSomewhere(Attribute::SwiftError)) && + MF.getFunction().getCallingConv() != CallingConv::SwiftTail; } static bool invalidateWindowsRegisterPairing(unsigned Reg1, unsigned Reg2, @@ -2260,6 +2269,7 @@ FirstReg = Count - 1; } int ScalableByteOffset = AFI->getSVECalleeSavedStackSize(); + bool NeedGapToAlignStack = AFI->hasCalleeSaveStackFreeSpace(); // When iterating backwards, the loop condition relies on unsigned wraparound. for (unsigned i = FirstReg; i < Count; i += RegInc) { @@ -2368,17 +2378,16 @@ // Round up size of non-pair to pair size if we need to pad the // callee-save area to ensure 16-byte alignment. - if (AFI->hasCalleeSaveStackFreeSpace() && !NeedsWinCFI && + if (NeedGapToAlignStack && !NeedsWinCFI && !RPI.isScalable() && RPI.Type != RegPairInfo::FPR128 && - !RPI.isPaired()) { + !RPI.isPaired() && ByteOffset % 16 != 0) { ByteOffset += 8 * StackFillDir; - assert(ByteOffset % 16 == 0); assert(MFI.getObjectAlign(RPI.FrameIdx) <= Align(16)); // A stack frame with a gap looks like this, bottom up: // d9, d8. x21, gap, x20, x19. - // Set extra alignment on the x21 object (the only unpaired register) - // to create the gap above it. + // Set extra alignment on the x21 object to create the gap above it. MFI.setObjectAlignment(RPI.FrameIdx, Align(16)); + NeedGapToAlignStack = false; } int OffsetPost = RPI.isScalable() ? ScalableByteOffset : ByteOffset; Index: llvm/lib/Target/AArch64/AArch64ISelLowering.cpp =================================================================== --- llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -4662,6 +4662,8 @@ case CallingConv::PreserveMost: case CallingConv::CXX_FAST_TLS: case CallingConv::Swift: + case CallingConv::SwiftTail: + case CallingConv::Tail: if (Subtarget->isTargetWindows() && IsVarArg) return CC_AArch64_Win64_VarArg; if (!Subtarget->isTargetDarwin()) @@ -5135,8 +5137,9 @@ } /// Return true if the calling convention is one that we can guarantee TCO for. -static bool canGuaranteeTCO(CallingConv::ID CC) { - return CC == CallingConv::Fast; +static bool canGuaranteeTCO(CallingConv::ID CC, bool GuaranteeTailCalls) { + return (CC == CallingConv::Fast && GuaranteeTailCalls) || + CC == CallingConv::Tail || CC == CallingConv::SwiftTail; } /// Return true if we might ever do TCO for calls with this calling convention. 
@@ -5146,9 +5149,12 @@ case CallingConv::AArch64_SVE_VectorCall: case CallingConv::PreserveMost: case CallingConv::Swift: + case CallingConv::SwiftTail: + case CallingConv::Tail: + case CallingConv::Fast: return true; default: - return canGuaranteeTCO(CC); + return false; } } @@ -5200,8 +5206,8 @@ return false; } - if (getTargetMachine().Options.GuaranteedTailCallOpt) - return canGuaranteeTCO(CalleeCC) && CCMatch; + if (canGuaranteeTCO(CalleeCC, getTargetMachine().Options.GuaranteedTailCallOpt)) + return CCMatch; // Externally-defined functions with weak linkage should not be // tail-called on AArch64 when the OS does not support dynamic @@ -5332,7 +5338,8 @@ bool AArch64TargetLowering::DoesCalleeRestoreStack(CallingConv::ID CallCC, bool TailCallOpt) const { - return CallCC == CallingConv::Fast && TailCallOpt; + return (CallCC == CallingConv::Fast && TailCallOpt) || + CallCC == CallingConv::Tail || CallCC == CallingConv::SwiftTail; } /// LowerCall - Lower a call to a callseq_start + CALL + callseq_end chain, @@ -5383,7 +5390,8 @@ // A sibling call is one where we're under the usual C ABI and not planning // to change that but can still do a tail call: - if (!TailCallOpt && IsTailCall) + if (!TailCallOpt && IsTailCall && CallConv != CallingConv::Tail && + CallConv != CallingConv::SwiftTail) IsSibCall = true; if (IsTailCall) @@ -5470,6 +5478,11 @@ // can actually shrink the stack. FPDiff = NumReusableBytes - NumBytes; + // Update the required reserved area if this is the tail call requiring the + // most argument stack space. + if (FPDiff < 0 && FuncInfo->getTailCallReservedStack() < (unsigned)-FPDiff) + FuncInfo->setTailCallReservedStack(-FPDiff); + // The stack pointer must be 16-byte aligned at all times it's used for a // memory operation, which in practice means at *all* times and in // particular across call boundaries. Therefore our own arguments started at @@ -5481,7 +5494,7 @@ // Adjust the stack pointer for the new arguments... // These operations are automatically eliminated by the prolog/epilog pass if (!IsSibCall) - Chain = DAG.getCALLSEQ_START(Chain, NumBytes, 0, DL); + Chain = DAG.getCALLSEQ_START(Chain, IsTailCall ? 0 : NumBytes, 0, DL); SDValue StackPtr = DAG.getCopyFromReg(Chain, DL, AArch64::SP, getPointerTy(DAG.getDataLayout())); @@ -5738,7 +5751,7 @@ // we've carefully laid out the parameters so that when sp is reset they'll be // in the correct location. if (IsTailCall && !IsSibCall) { - Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(NumBytes, DL, true), + Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(0, DL, true), DAG.getIntPtrConstant(0, DL, true), InFlag, DL); InFlag = Chain.getValue(1); } Index: llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.h =================================================================== --- llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.h +++ llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.h @@ -54,6 +54,12 @@ /// callee is expected to pop the args. unsigned ArgumentStackToRestore = 0; + // Space just below incoming stack pointer reserved for arguments being passed + // on the stack during a tail call. This will be the difference between the + // largest tail call argument space needed in this function and what's already + // available by reusing space of incoming arguments. + unsigned TailCallReservedStack = 0; + /// HasStackFrame - True if this function has a stack frame. Set by /// determineCalleeSaves(). 
bool HasStackFrame = false; @@ -180,6 +186,11 @@ ArgumentStackToRestore = bytes; } + unsigned getTailCallReservedStack() const { return TailCallReservedStack; } + void setTailCallReservedStack(unsigned bytes) { + TailCallReservedStack = bytes; + } + bool hasCalculatedStackSizeSVE() const { return HasCalculatedStackSizeSVE; } void setStackSizeSVE(uint64_t S) { Index: llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp =================================================================== --- llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp +++ llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp @@ -100,6 +100,8 @@ MF->getFunction().getAttributes().hasAttrSomewhere( Attribute::SwiftError)) return CSR_AArch64_AAPCS_SwiftError_SaveList; + if (MF->getFunction().getCallingConv() == CallingConv::SwiftTail) + return CSR_AArch64_AAPCS_SwiftTail_SaveList; if (MF->getFunction().getCallingConv() == CallingConv::PreserveMost) return CSR_AArch64_RT_MostRegs_SaveList; if (MF->getFunction().getCallingConv() == CallingConv::Win64) @@ -134,6 +136,8 @@ MF->getFunction().getAttributes().hasAttrSomewhere( Attribute::SwiftError)) return CSR_Darwin_AArch64_AAPCS_SwiftError_SaveList; + if (MF->getFunction().getCallingConv() == CallingConv::SwiftTail) + return CSR_Darwin_AArch64_AAPCS_SwiftTail_SaveList; if (MF->getFunction().getCallingConv() == CallingConv::PreserveMost) return CSR_Darwin_AArch64_RT_MostRegs_SaveList; return CSR_Darwin_AArch64_AAPCS_SaveList; @@ -199,6 +203,8 @@ ->supportSwiftError() && MF.getFunction().getAttributes().hasAttrSomewhere(Attribute::SwiftError)) return CSR_Darwin_AArch64_AAPCS_SwiftError_RegMask; + if (CC == CallingConv::SwiftTail) + return CSR_Darwin_AArch64_AAPCS_SwiftTail_RegMask; if (CC == CallingConv::PreserveMost) return CSR_Darwin_AArch64_RT_MostRegs_RegMask; return CSR_Darwin_AArch64_AAPCS_RegMask; @@ -233,6 +239,11 @@ MF.getFunction().getAttributes().hasAttrSomewhere(Attribute::SwiftError)) return SCS ? CSR_AArch64_AAPCS_SwiftError_SCS_RegMask : CSR_AArch64_AAPCS_SwiftError_RegMask; + if (CC == CallingConv::SwiftTail) { + if (SCS) + report_fatal_error("ShadowCallStack attribute not supported with swifttail"); + return CSR_AArch64_AAPCS_SwiftTail_RegMask; + } if (CC == CallingConv::PreserveMost) return SCS ? CSR_AArch64_RT_MostRegs_SCS_RegMask : CSR_AArch64_RT_MostRegs_RegMask; Index: llvm/lib/Target/AArch64/GISel/AArch64CallLowering.cpp =================================================================== --- llvm/lib/Target/AArch64/GISel/AArch64CallLowering.cpp +++ llvm/lib/Target/AArch64/GISel/AArch64CallLowering.cpp @@ -264,7 +264,8 @@ } // namespace static bool doesCalleeRestoreStack(CallingConv::ID CallConv, bool TailCallOpt) { - return CallConv == CallingConv::Fast && TailCallOpt; + return (CallConv == CallingConv::Fast && TailCallOpt) || + CallConv == CallingConv::Tail || CallConv == CallingConv::SwiftTail; } bool AArch64CallLowering::lowerReturn(MachineIRBuilder &MIRBuilder, @@ -534,8 +535,9 @@ } /// Return true if the calling convention is one that we can guarantee TCO for. -static bool canGuaranteeTCO(CallingConv::ID CC) { - return CC == CallingConv::Fast; +static bool canGuaranteeTCO(CallingConv::ID CC, bool GuaranteeTailCalls) { + return (CC == CallingConv::Fast && GuaranteeTailCalls) || + CC == CallingConv::Tail || CC == CallingConv::SwiftTail; } /// Return true if we might ever do TCO for calls with this calling convention. 
@@ -544,9 +546,12 @@ case CallingConv::C: case CallingConv::PreserveMost: case CallingConv::Swift: + case CallingConv::SwiftTail: + case CallingConv::Tail: + case CallingConv::Fast: return true; default: - return canGuaranteeTCO(CC); + return false; } } @@ -731,8 +736,8 @@ } // If we have -tailcallopt, then we're done. - if (MF.getTarget().Options.GuaranteedTailCallOpt) - return canGuaranteeTCO(CalleeCC) && CalleeCC == CallerF.getCallingConv(); + if (canGuaranteeTCO(CalleeCC, MF.getTarget().Options.GuaranteedTailCallOpt)) + return CalleeCC == CallerF.getCallingConv(); // We don't have -tailcallopt, so we're allowed to change the ABI (sibcall). // Try to find cases where we can do that. @@ -803,7 +808,9 @@ AArch64FunctionInfo *FuncInfo = MF.getInfo(); // True when we're tail calling, but without -tailcallopt. - bool IsSibCall = !MF.getTarget().Options.GuaranteedTailCallOpt; + bool IsSibCall = !MF.getTarget().Options.GuaranteedTailCallOpt && + Info.CallConv != CallingConv::Tail && + Info.CallConv != CallingConv::SwiftTail; // TODO: Right now, regbankselect doesn't know how to handle the rtcGPR64 // register class. Until we can do that, we should fall back here. @@ -870,6 +877,11 @@ // actually shrink the stack. FPDiff = NumReusableBytes - NumBytes; + // Update the required reserved area if this is the tail call requiring the + // most argument stack space. + if (FPDiff < 0 && FuncInfo->getTailCallReservedStack() < (unsigned)-FPDiff) + FuncInfo->setTailCallReservedStack(-FPDiff); + // The stack pointer must be 16-byte aligned at all times it's used for a // memory operation, which in practice means at *all* times and in // particular across call boundaries. Therefore our own arguments started at @@ -913,12 +925,12 @@ // sequence start and end here. if (!IsSibCall) { MIB->getOperand(1).setImm(FPDiff); - CallSeqStart.addImm(NumBytes).addImm(0); + CallSeqStart.addImm(0).addImm(0); // End the call sequence *before* emitting the call. Normally, we would // tidy the frame up after the call. However, here, we've laid out the // parameters so that when SP is reset, they will be in the correct // location. - MIRBuilder.buildInstr(AArch64::ADJCALLSTACKUP).addImm(NumBytes).addImm(0); + MIRBuilder.buildInstr(AArch64::ADJCALLSTACKUP).addImm(0).addImm(0); } // Now we can add the actual call instruction to the correct basic block. Index: llvm/lib/Target/ARM/ARMISelLowering.cpp =================================================================== --- llvm/lib/Target/ARM/ARMISelLowering.cpp +++ llvm/lib/Target/ARM/ARMISelLowering.cpp @@ -2004,6 +2004,7 @@ return CallingConv::PreserveMost; case CallingConv::ARM_AAPCS_VFP: case CallingConv::Swift: + case CallingConv::SwiftTail: return isVarArg ? CallingConv::ARM_AAPCS : CallingConv::ARM_AAPCS_VFP; case CallingConv::C: if (!Subtarget->isAAPCS_ABI()) Index: llvm/lib/Target/X86/X86CallingConv.td =================================================================== --- llvm/lib/Target/X86/X86CallingConv.td +++ llvm/lib/Target/X86/X86CallingConv.td @@ -462,6 +462,7 @@ // Handle Swift calls. CCIfCC<"CallingConv::Swift", CCDelegateTo>, + CCIfCC<"CallingConv::SwiftTail", CCDelegateTo>, // Handle explicit CC selection CCIfCC<"CallingConv::Win64", CCDelegateTo>, @@ -521,9 +522,11 @@ // normal functions don't need to save it somewhere. CCIfSwiftAsync>>, - // For Swift Calling Convention, pass sret in %rax. + // For Swift Calling Conventions, pass sret in %rax. 
CCIfCC<"CallingConv::Swift", CCIfSRet>>>, + CCIfCC<"CallingConv::SwiftTail", + CCIfSRet>>>, // Pointers are always passed in full 64-bit registers. CCIfPtr>, @@ -855,6 +858,10 @@ // The 'nest' parameter, if any, is passed in ECX. CCIfNest>, + // On swifttailcc pass swiftself in ECX. + CCIfCC<"CallingConv::SwiftTail", + CCIfSwiftSelf>>>, + // The first 3 integer arguments, if marked 'inreg' and if the call is not // a vararg call, are passed in integer registers. CCIfNotVarArg>>>, @@ -1084,6 +1091,7 @@ def CSR_64 : CalleeSavedRegs<(add RBX, R12, R13, R14, R15, RBP)>; def CSR_64_SwiftError : CalleeSavedRegs<(sub CSR_64, R12)>; +def CSR_64_SwiftTail : CalleeSavedRegs<(sub CSR_64, R13, R14)>; def CSR_32EHRet : CalleeSavedRegs<(add EAX, EDX, CSR_32)>; def CSR_64EHRet : CalleeSavedRegs<(add RAX, RDX, CSR_64)>; @@ -1094,6 +1102,7 @@ (sequence "XMM%u", 6, 15))>; def CSR_Win64_SwiftError : CalleeSavedRegs<(sub CSR_Win64, R12)>; +def CSR_Win64_SwiftTail : CalleeSavedRegs<(sub CSR_Win64, R13, R14)>; // The function used by Darwin to obtain the address of a thread-local variable // uses rdi to pass a single parameter and rax for the return value. All other Index: llvm/lib/Target/X86/X86FastISel.cpp =================================================================== --- llvm/lib/Target/X86/X86FastISel.cpp +++ llvm/lib/Target/X86/X86FastISel.cpp @@ -1183,6 +1183,7 @@ if (CC != CallingConv::C && CC != CallingConv::Fast && CC != CallingConv::Tail && + CC != CallingConv::SwiftTail && CC != CallingConv::X86_FastCall && CC != CallingConv::X86_StdCall && CC != CallingConv::X86_ThisCall && @@ -1197,7 +1198,7 @@ // fastcc with -tailcallopt is intended to provide a guaranteed // tail call optimization. Fastisel doesn't know how to do that. if ((CC == CallingConv::Fast && TM.Options.GuaranteedTailCallOpt) || - CC == CallingConv::Tail) + CC == CallingConv::Tail || CC == CallingConv::SwiftTail) return false; // Let SDISel handle vararg functions. @@ -1285,7 +1286,8 @@ // the sret argument into %rax/%eax (depending on ABI) for the return. // We saved the argument into a virtual register in the entry block, // so now we copy the value out and into %rax/%eax. - if (F.hasStructRetAttr() && CC != CallingConv::Swift) { + if (F.hasStructRetAttr() && CC != CallingConv::Swift && + CC != CallingConv::SwiftTail) { Register Reg = X86MFInfo->getSRetReturnReg(); assert(Reg && "SRetReturnReg should have been set in LowerFormalArguments()!"); @@ -3143,7 +3145,8 @@ if (Subtarget->getTargetTriple().isOSMSVCRT()) return 0; if (CC == CallingConv::Fast || CC == CallingConv::GHC || - CC == CallingConv::HiPE || CC == CallingConv::Tail) + CC == CallingConv::HiPE || CC == CallingConv::Tail || + CC == CallingConv::SwiftTail) return 0; if (CB) @@ -3195,6 +3198,7 @@ case CallingConv::Tail: case CallingConv::WebKit_JS: case CallingConv::Swift: + case CallingConv::SwiftTail: case CallingConv::X86_FastCall: case CallingConv::X86_StdCall: case CallingConv::X86_ThisCall: @@ -3211,7 +3215,7 @@ // fastcc with -tailcallopt is intended to provide a guaranteed // tail call optimization. Fastisel doesn't know how to do that. if ((CC == CallingConv::Fast && TM.Options.GuaranteedTailCallOpt) || - CC == CallingConv::Tail) + CC == CallingConv::Tail || CC == CallingConv::SwiftTail) return false; // Don't know how to handle Win64 varargs yet. 
Nothing special needed for Index: llvm/lib/Target/X86/X86ISelLowering.cpp =================================================================== --- llvm/lib/Target/X86/X86ISelLowering.cpp +++ llvm/lib/Target/X86/X86ISelLowering.cpp @@ -3175,7 +3175,8 @@ static bool canGuaranteeTCO(CallingConv::ID CC) { return (CC == CallingConv::Fast || CC == CallingConv::GHC || CC == CallingConv::X86_RegCall || CC == CallingConv::HiPE || - CC == CallingConv::HHVM || CC == CallingConv::Tail); + CC == CallingConv::HHVM || CC == CallingConv::Tail || + CC == CallingConv::SwiftTail); } /// Return true if we might ever do TCO for calls with this calling convention. @@ -3201,7 +3202,8 @@ /// Return true if the function is being made into a tailcall target by /// changing its ABI. static bool shouldGuaranteeTCO(CallingConv::ID CC, bool GuaranteedTailCallOpt) { - return (GuaranteedTailCallOpt && canGuaranteeTCO(CC)) || CC == CallingConv::Tail; + return (GuaranteedTailCallOpt && canGuaranteeTCO(CC)) || + CC == CallingConv::Tail || CC == CallingConv::SwiftTail; } bool X86TargetLowering::mayBeEmittedAsTailCall(const CallInst *CI) const { @@ -3752,7 +3754,7 @@ // Swift calling convention does not require we copy the sret argument // into %rax/%eax for the return. We don't set SRetReturnReg for Swift. - if (CallConv == CallingConv::Swift) + if (CallConv == CallingConv::Swift || CallConv == CallingConv::SwiftTail) continue; // All x86 ABIs require that for returning structs by value we copy the @@ -3917,7 +3919,7 @@ StructReturnType SR = callIsStructReturn(Outs, Subtarget.isTargetMCU()); bool IsSibcall = false; bool IsGuaranteeTCO = MF.getTarget().Options.GuaranteedTailCallOpt || - CallConv == CallingConv::Tail; + CallConv == CallingConv::Tail || CallConv == CallingConv::SwiftTail; X86MachineFunctionInfo *X86Info = MF.getInfo(); bool HasNCSR = (CB && isa(CB) && CB->hasFnAttr("no_caller_saved_registers")); @@ -4627,7 +4629,7 @@ bool IsCalleeWin64 = Subtarget.isCallingConvWin64(CalleeCC); bool IsCallerWin64 = Subtarget.isCallingConvWin64(CallerCC); bool IsGuaranteeTCO = DAG.getTarget().Options.GuaranteedTailCallOpt || - CalleeCC == CallingConv::Tail; + CalleeCC == CallingConv::Tail || CalleeCC == CallingConv::SwiftTail; // Win64 functions have extra shadow space for argument homing. Don't do the // sibcall if the caller and callee have mismatched expectations for this @@ -26892,6 +26894,7 @@ case CallingConv::X86_ThisCall: case CallingConv::Fast: case CallingConv::Tail: + case CallingConv::SwiftTail: // Pass 'nest' parameter in EAX. // Must be kept in sync with X86CallingConv.td NestReg = X86::EAX; Index: llvm/lib/Target/X86/X86RegisterInfo.cpp =================================================================== --- llvm/lib/Target/X86/X86RegisterInfo.cpp +++ llvm/lib/Target/X86/X86RegisterInfo.cpp @@ -354,6 +354,10 @@ if (!HasSSE) return CSR_Win64_NoSSE_SaveList; return CSR_Win64_SaveList; + case CallingConv::SwiftTail: + if (!Is64Bit) + return CSR_32_SaveList; + return IsWin64 ? CSR_Win64_SwiftTail_SaveList : CSR_64_SwiftTail_SaveList; case CallingConv::X86_64_SysV: if (CallsEHReturn) return CSR_64EHRet_SaveList; @@ -470,6 +474,10 @@ break; case CallingConv::Win64: return CSR_Win64_RegMask; + case CallingConv::SwiftTail: + if (!Is64Bit) + return CSR_32_RegMask; + return IsWin64 ? 
CSR_Win64_SwiftTail_RegMask : CSR_64_SwiftTail_RegMask; case CallingConv::X86_64_SysV: return CSR_64_RegMask; case CallingConv::X86_INTR: @@ -502,6 +510,7 @@ F.getAttributes().hasAttrSomewhere(Attribute::SwiftError); if (IsSwiftCC) return IsWin64 ? CSR_Win64_SwiftError_RegMask : CSR_64_SwiftError_RegMask; + return IsWin64 ? CSR_Win64_RegMask : CSR_64_RegMask; } Index: llvm/lib/Target/X86/X86Subtarget.h =================================================================== --- llvm/lib/Target/X86/X86Subtarget.h +++ llvm/lib/Target/X86/X86Subtarget.h @@ -885,6 +885,7 @@ case CallingConv::Fast: case CallingConv::Tail: case CallingConv::Swift: + case CallingConv::SwiftTail: case CallingConv::X86_FastCall: case CallingConv::X86_StdCall: case CallingConv::X86_ThisCall: Index: llvm/test/Bitcode/compatibility.ll =================================================================== --- llvm/test/Bitcode/compatibility.ll +++ llvm/test/Bitcode/compatibility.ll @@ -382,6 +382,8 @@ ; CHECK: declare preserve_mostcc void @f.preserve_mostcc() declare preserve_allcc void @f.preserve_allcc() ; CHECK: declare preserve_allcc void @f.preserve_allcc() +declare swifttailcc void @f.swifttailcc() +; CHECK: declare swifttailcc void @f.swifttailcc() declare cc64 void @f.cc64() ; CHECK: declare x86_stdcallcc void @f.cc64() declare x86_stdcallcc void @f.x86_stdcallcc() Index: llvm/test/CodeGen/AArch64/swift-async.ll =================================================================== --- llvm/test/CodeGen/AArch64/swift-async.ll +++ llvm/test/CodeGen/AArch64/swift-async.ll @@ -5,7 +5,7 @@ ; Important details in prologue: ; * x22 is stored just below x29 ; * Enough stack space is allocated for everything -define void @simple(i8* swiftasync %ctx) "frame-pointer"="all" { +define swifttailcc void @simple(i8* swiftasync %ctx) "frame-pointer"="all" { ; CHECK-LABEL: simple: ; CHECK: orr x29, x29, #0x100000000000000 ; CHECK: sub sp, sp, #32 @@ -32,21 +32,20 @@ ret void } -define void @more_csrs(i8* swiftasync %ctx) "frame-pointer"="all" { +define swifttailcc void @more_csrs(i8* swiftasync %ctx) "frame-pointer"="all" { ; CHECK-LABEL: more_csrs: ; CHECK: orr x29, x29, #0x100000000000000 -; CHECK: sub sp, sp, #48 -; CHECK: stp x24, x23, [sp, #8] -; CHECK: stp x29, x30, [sp, #32] +; CHECK: str x23, [sp, #-32]! +; CHECK: stp x29, x30, [sp, #16] -; CHECK-NOAUTH: str x22, [sp, #24] -; CHECK-AUTH: add x16, sp, #24 +; CHECK-NOAUTH: str x22, [sp, #8] +; CHECK-AUTH: add x16, sp, #8 ; CHECK-AUTH: movk x16, #49946, lsl #48 ; CHECK-AUTH: mov x17, x22 ; CHECK-AUTH: pacdb x17, x16 -; CHECK-AUTH: str x17, [sp, #24] +; CHECK-AUTH: str x17, [sp, #8] -; CHECK: add x29, sp, #32 +; CHECK: add x29, sp, #16 ; CHECK: .cfi_def_cfa w29, 16 ; CHECK: .cfi_offset w30, -8 ; CHECK: .cfi_offset w29, -16 @@ -54,15 +53,14 @@ ; [...] 
-; CHECK: ldp x29, x30, [sp, #32] -; CHECK: ldp x24, x23, [sp, #8] +; CHECK: ldp x29, x30, [sp, #16] +; CHECK: ldr x23, [sp], #32 ; CHECK: and x29, x29, #0xefffffffffffffff -; CHECK: add sp, sp, #48 call void asm sideeffect "", "~{x23}"() ret void } -define void @locals(i8* swiftasync %ctx) "frame-pointer"="all" { +define swifttailcc void @locals(i8* swiftasync %ctx) "frame-pointer"="all" { ; CHECK-LABEL: locals: ; CHECK: orr x29, x29, #0x100000000000000 ; CHECK: sub sp, sp, #64 @@ -93,7 +91,7 @@ ret void } -define void @use_input_context(i8* swiftasync %ctx, i8** %ptr) "frame-pointer"="all" { +define swifttailcc void @use_input_context(i8* swiftasync %ctx, i8** %ptr) "frame-pointer"="all" { ; CHECK-LABEL: use_input_context: ; CHECK-NOAUTH: str x22, [sp @@ -106,7 +104,7 @@ ret void } -define i8** @context_in_func() "frame-pointer"="non-leaf" { +define swifttailcc i8** @context_in_func() "frame-pointer"="non-leaf" { ; CHECK-LABEL: context_in_func: ; CHECK-NOAUTH: str xzr, [sp, #8] @@ -120,7 +118,7 @@ ret i8** %ptr } -define void @write_frame_context(i8* swiftasync %ctx, i8* %newctx) "frame-pointer"="non-leaf" { +define swifttailcc void @write_frame_context(i8* swiftasync %ctx, i8* %newctx) "frame-pointer"="non-leaf" { ; CHECK-LABEL: write_frame_context: ; CHECK: sub x[[ADDR:[0-9]+]], x29, #8 ; CHECK: str x0, [x[[ADDR]]] @@ -129,29 +127,48 @@ ret void } -define void @simple_fp_elim(i8* swiftasync %ctx) "frame-pointer"="non-leaf" { +define swifttailcc void @simple_fp_elim(i8* swiftasync %ctx) "frame-pointer"="non-leaf" { ; CHECK-LABEL: simple_fp_elim: ; CHECK-NOT: orr x29, x29, #0x100000000000000 ret void } -define void @large_frame(i8* swiftasync %ctx) "frame-pointer"="all" { +define swifttailcc void @large_frame(i8* swiftasync %ctx) "frame-pointer"="all" { ; CHECK-LABEL: large_frame: -; CHECK: sub sp, sp, #48 -; CHECK: stp x28, x27, [sp, #8] -; CHECK: stp x29, x30, [sp, #32] -; CHECK-NOAUTH: str x22, [sp, #24] -; CHECK: add x29, sp, #32 +; CHECK: str x28, [sp, #-32]! +; CHECK: stp x29, x30, [sp, #16] +; CHECK-NOAUTH: str x22, [sp, #8] +; CHECK: add x29, sp, #16 ; CHECK: sub sp, sp, #1024 ; [...] ; CHECK: add sp, sp, #1024 -; CHECK: ldp x29, x30, [sp, #32] -; CHECK: ldp x28, x27, [sp, #8] +; CHECK: ldp x29, x30, [sp, #16] +; CHECK: ldr x28, [sp], #32 ; CHECK: ret %var = alloca i8, i32 1024 ret void } -declare void @bar(i32*) +; Important point is that there is just one 8-byte gap in the CSR region (right +; now just above d8) to realign the stack. +define swifttailcc void @two_unpaired_csrs(i8* swiftasync) "frame-pointer"="all" { +; CHECK-LABEL: two_unpaired_csrs: +; CHECK: str d8, [sp, #-48]! 
+; CHECK: str x19, [sp, #16] +; CHECK: stp x29, x30, [sp, #32] +; CHECK-NOAUTH: str x22, [sp, #24] +; CHECK: add x29, sp, #32 + +; CHECK: .cfi_def_cfa w29, 16 +; CHECK: .cfi_offset w30, -8 +; CHECK: .cfi_offset w29, -16 +; CHECK: .cfi_offset w19, -32 +; CHECK: .cfi_offset b8, -48 + + call void asm "","~{x19},~{d8}"() + call swifttailcc void @bar(i32* undef) + ret void +} +declare swifttailcc void @bar(i32*) declare i8** @llvm.swift.async.context.addr() Index: llvm/test/CodeGen/AArch64/swifttail-async.ll =================================================================== --- /dev/null +++ llvm/test/CodeGen/AArch64/swifttail-async.ll @@ -0,0 +1,18 @@ +; RUN: llc -mtriple=arm64-apple-ios %s -o - | FileCheck %s + + +declare swifttailcc void @swifttail_callee() +define swifttailcc void @swifttail() { +; CHECK-LABEL: swifttail: +; CHECK-NOT: ld{{.*}}x22 + call void asm "","~{x22}"() + tail call swifttailcc void @swifttail_callee() + ret void +} + +define swifttailcc void @no_preserve_swiftself() { +; CHECK-LABEL: no_preserve_swiftself: +; CHECK-NOT: ld{{.*}}x20 + call void asm "","~{x20}"() + ret void +} Index: llvm/test/CodeGen/AArch64/swifttail-call.ll =================================================================== --- /dev/null +++ llvm/test/CodeGen/AArch64/swifttail-call.ll @@ -0,0 +1,230 @@ +; RUN: llc -verify-machineinstrs < %s -mtriple=aarch64-none-linux-gnu | FileCheck %s --check-prefixes=SDAG,COMMON +; RUN: llc -global-isel -global-isel-abort=1 -verify-machineinstrs < %s -mtriple=aarch64-none-linux-gnu | FileCheck %s --check-prefixes=GISEL,COMMON + +declare swifttailcc void @callee_stack0() +declare swifttailcc void @callee_stack8([8 x i64], i64) +declare swifttailcc void @callee_stack16([8 x i64], i64, i64) +declare extern_weak swifttailcc void @callee_weak() + +define swifttailcc void @caller_to0_from0() nounwind { +; COMMON-LABEL: caller_to0_from0: +; COMMON-NEXT: // %bb. + + tail call swifttailcc void @callee_stack0() + ret void + +; COMMON-NEXT: b callee_stack0 +} + +define swifttailcc void @caller_to0_from8([8 x i64], i64) { +; COMMON-LABEL: caller_to0_from8: + + tail call swifttailcc void @callee_stack0() + ret void + +; COMMON: add sp, sp, #16 +; COMMON-NEXT: b callee_stack0 +} + +define swifttailcc void @caller_to8_from0() { +; COMMON-LABEL: caller_to8_from0: + +; Key point is that the "42" should go #16 below incoming stack +; pointer (we didn't have arg space to reuse). + tail call swifttailcc void @callee_stack8([8 x i64] undef, i64 42) + ret void + +; COMMON: str {{x[0-9]+}}, [sp, #-16]! +; COMMON-NEXT: b callee_stack8 +} + +define swifttailcc void @caller_to8_from8([8 x i64], i64 %a) { +; COMMON-LABEL: caller_to8_from8: +; COMMON-NOT: sub sp, + +; Key point is that the "%a" should go where at SP on entry. + tail call swifttailcc void @callee_stack8([8 x i64] undef, i64 42) + ret void + +; COMMON: str {{x[0-9]+}}, [sp] +; COMMON-NEXT: b callee_stack8 +} + +define swifttailcc void @caller_to16_from8([8 x i64], i64 %a) { +; COMMON-LABEL: caller_to16_from8: +; COMMON-NOT: sub sp, + +; Important point is that the call reuses the "dead" argument space +; above %a on the stack. If it tries to go below incoming-SP then the +; callee will not deallocate the space, even in swifttailcc. 
+ tail call swifttailcc void @callee_stack16([8 x i64] undef, i64 42, i64 2) + +; COMMON: stp {{x[0-9]+}}, {{x[0-9]+}}, [sp] +; COMMON-NEXT: b callee_stack16 + ret void +} + + +define swifttailcc void @caller_to8_from24([8 x i64], i64 %a, i64 %b, i64 %c) { +; COMMON-LABEL: caller_to8_from24: +; COMMON-NOT: sub sp, + +; Key point is that the "%a" should go where at #16 above SP on entry. + tail call swifttailcc void @callee_stack8([8 x i64] undef, i64 42) + ret void + +; COMMON: str {{x[0-9]+}}, [sp, #16]! +; COMMON-NEXT: b callee_stack8 +} + + +define swifttailcc void @caller_to16_from16([8 x i64], i64 %a, i64 %b) { +; COMMON-LABEL: caller_to16_from16: +; COMMON-NOT: sub sp, + +; Here we want to make sure that both loads happen before the stores: +; otherwise either %a or %b will be wrongly clobbered. + tail call swifttailcc void @callee_stack16([8 x i64] undef, i64 %b, i64 %a) + ret void + +; COMMON: ldp {{x[0-9]+}}, {{x[0-9]+}}, [sp] +; COMMON: stp {{x[0-9]+}}, {{x[0-9]+}}, [sp] +; COMMON-NEXT: b callee_stack16 +} + +define swifttailcc void @disable_tail_calls() nounwind "disable-tail-calls"="true" { +; COMMON-LABEL: disable_tail_calls: +; COMMON-NEXT: // %bb. + + tail call swifttailcc void @callee_stack0() + ret void + +; COMMON: bl callee_stack0 +; COMMON: ret +} + +; Weakly-referenced extern functions cannot be tail-called, as AAELF does +; not define the behaviour of branch instructions to undefined weak symbols. +define swifttailcc void @caller_weak() { +; COMMON-LABEL: caller_weak: +; COMMON: bl callee_weak + tail call void @callee_weak() + ret void +} + +declare { [2 x float] } @get_vec2() + +define { [3 x float] } @test_add_elem() { +; SDAG-LABEL: test_add_elem: +; SDAG: bl get_vec2 +; SDAG: fmov s2, #1.0 +; SDAG: ret +; GISEL-LABEL: test_add_elem: +; GISEL: str x30, [sp, #-16]! +; GISEL: bl get_vec2 +; GISEL: fmov s2, #1.0 +; GISEL: ldr x30, [sp], #16 +; GISEL: ret + + %call = tail call { [2 x float] } @get_vec2() + %arr = extractvalue { [2 x float] } %call, 0 + %arr.0 = extractvalue [2 x float] %arr, 0 + %arr.1 = extractvalue [2 x float] %arr, 1 + + %res.0 = insertvalue { [3 x float] } undef, float %arr.0, 0, 0 + %res.01 = insertvalue { [3 x float] } %res.0, float %arr.1, 0, 1 + %res.012 = insertvalue { [3 x float] } %res.01, float 1.000000e+00, 0, 2 + ret { [3 x float] } %res.012 +} + +declare double @get_double() +define { double, [2 x double] } @test_mismatched_insert() { +; COMMON-LABEL: test_mismatched_insert: +; COMMON: bl get_double +; COMMON: bl get_double +; COMMON: bl get_double +; COMMON: ret + + %val0 = call double @get_double() + %val1 = call double @get_double() + %val2 = tail call double @get_double() + + %res.0 = insertvalue { double, [2 x double] } undef, double %val0, 0 + %res.01 = insertvalue { double, [2 x double] } %res.0, double %val1, 1, 0 + %res.012 = insertvalue { double, [2 x double] } %res.01, double %val2, 1, 1 + + ret { double, [2 x double] } %res.012 +} + +define void @fromC_totail() { +; COMMON-LABEL: fromC_totail: +; COMMON: sub sp, sp, #48 + +; COMMON-NOT: sub sp, +; COMMON: mov w[[TMP:[0-9]+]], #42 +; COMMON: str x[[TMP]], [sp] +; COMMON: bl callee_stack8 + ; We must reset the stack to where it was before the call by undoing its extra stack pop. +; COMMON: str x[[TMP]], [sp, #-16]! 
+; COMMON: bl callee_stack8 +; COMMON: sub sp, sp, #16 + + call swifttailcc void @callee_stack8([8 x i64] undef, i64 42) + call swifttailcc void @callee_stack8([8 x i64] undef, i64 42) + ret void +} + +define void @fromC_totail_noreservedframe(i32 %len) { +; COMMON-LABEL: fromC_totail_noreservedframe: +; COMMON: stp x29, x30, [sp, #-48]! + +; COMMON: mov w[[TMP:[0-9]+]], #42 + ; Note stack is subtracted here to allocate space for arg +; COMMON: str x[[TMP]], [sp, #-16]! +; COMMON: bl callee_stack8 + ; And here. +; COMMON: str x[[TMP]], [sp, #-16]! +; COMMON: bl callee_stack8 + ; But not restored here because callee_stack8 did that for us. +; COMMON-NOT: sub sp, + + ; Variable sized allocation prevents reserving frame at start of function so each call must allocate any stack space it needs. + %var = alloca i32, i32 %len + + call swifttailcc void @callee_stack8([8 x i64] undef, i64 42) + call swifttailcc void @callee_stack8([8 x i64] undef, i64 42) + ret void +} + +declare void @Ccallee_stack8([8 x i64], i64) + +define swifttailcc void @fromtail_toC() { +; COMMON-LABEL: fromtail_toC: +; COMMON: sub sp, sp, #32 + +; COMMON-NOT: sub sp, +; COMMON: mov w[[TMP:[0-9]+]], #42 +; COMMON: str x[[TMP]], [sp] +; COMMON: bl Ccallee_stack8 + ; C callees will return with the stack exactly where we left it, so we mustn't try to fix anything. +; COMMON-NOT: add sp, +; COMMON-NOT: sub sp, +; COMMON: str x[[TMP]], [sp]{{$}} +; COMMON: bl Ccallee_stack8 +; COMMON-NOT: sub sp, + + + call void @Ccallee_stack8([8 x i64] undef, i64 42) + call void @Ccallee_stack8([8 x i64] undef, i64 42) + ret void +} + +declare swifttailcc i8* @SwiftSelf(i8 * swiftasync %context, i8* swiftself %closure) +define swiftcc i8* @CallSwiftSelf(i8* swiftself %closure, i8* %context) { +; CHECK-LABEL: CallSwiftSelf: +; CHECK: stp x20 + ;call void asm "","~{r13}"() ; We get a push r13 but why not with the call + ; below? + %res = call swifttailcc i8* @SwiftSelf(i8 * swiftasync %context, i8* swiftself %closure) + ret i8* %res +} Index: llvm/test/CodeGen/AArch64/tail-call.ll =================================================================== --- llvm/test/CodeGen/AArch64/tail-call.ll +++ llvm/test/CodeGen/AArch64/tail-call.ll @@ -28,39 +28,38 @@ define fastcc void @caller_to8_from0() { ; COMMON-LABEL: caller_to8_from0: -; COMMON: sub sp, sp, #32 ; Key point is that the "42" should go #16 below incoming stack ; pointer (we didn't have arg space to reuse). tail call fastcc void @callee_stack8([8 x i64] undef, i64 42) ret void -; COMMON: str {{x[0-9]+}}, [sp, #16]! +; COMMON: str {{x[0-9]+}}, [sp, #-16]! ; COMMON-NEXT: b callee_stack8 } define fastcc void @caller_to8_from8([8 x i64], i64 %a) { ; COMMON-LABEL: caller_to8_from8: -; COMMON: sub sp, sp, #16 +; COMMON-NOT: sub sp, ; Key point is that the "%a" should go where at SP on entry. tail call fastcc void @callee_stack8([8 x i64] undef, i64 42) ret void -; COMMON: str {{x[0-9]+}}, [sp, #16]! +; COMMON: str {{x[0-9]+}}, [sp] ; COMMON-NEXT: b callee_stack8 } define fastcc void @caller_to16_from8([8 x i64], i64 %a) { ; COMMON-LABEL: caller_to16_from8: -; COMMON: sub sp, sp, #16 +; COMMON-NOT: sub sp, ; Important point is that the call reuses the "dead" argument space ; above %a on the stack. If it tries to go below incoming-SP then the ; callee will not deallocate the space, even in fastcc. tail call fastcc void @callee_stack16([8 x i64] undef, i64 42, i64 2) -; COMMON: stp {{x[0-9]+}}, {{x[0-9]+}}, [sp, #16]! 
+; COMMON: stp {{x[0-9]+}}, {{x[0-9]+}}, [sp] ; COMMON-NEXT: b callee_stack16 ret void } @@ -68,28 +67,28 @@ define fastcc void @caller_to8_from24([8 x i64], i64 %a, i64 %b, i64 %c) { ; COMMON-LABEL: caller_to8_from24: -; COMMON: sub sp, sp, #16 +; COMMON-NOT: sub sp, ; Key point is that the "%a" should go where at #16 above SP on entry. tail call fastcc void @callee_stack8([8 x i64] undef, i64 42) ret void -; COMMON: str {{x[0-9]+}}, [sp, #32]! +; COMMON: str {{x[0-9]+}}, [sp, #16]! ; COMMON-NEXT: b callee_stack8 } define fastcc void @caller_to16_from16([8 x i64], i64 %a, i64 %b) { ; COMMON-LABEL: caller_to16_from16: -; COMMON: sub sp, sp, #16 +; COMMON-NOT: sub sp, ; Here we want to make sure that both loads happen before the stores: ; otherwise either %a or %b will be wrongly clobbered. tail call fastcc void @callee_stack16([8 x i64] undef, i64 %b, i64 %a) ret void -; COMMON: ldp {{x[0-9]+}}, {{x[0-9]+}}, [sp, #16] -; COMMON: stp {{x[0-9]+}}, {{x[0-9]+}}, [sp, #16]! +; COMMON: ldp {{x[0-9]+}}, {{x[0-9]+}}, [sp] +; COMMON: stp {{x[0-9]+}}, {{x[0-9]+}}, [sp] ; COMMON-NEXT: b callee_stack16 } Index: llvm/test/CodeGen/AArch64/tailcc-tail-call.ll =================================================================== --- /dev/null +++ llvm/test/CodeGen/AArch64/tailcc-tail-call.ll @@ -0,0 +1,225 @@ +; RUN: llc -verify-machineinstrs < %s -mtriple=aarch64-none-linux-gnu | FileCheck %s --check-prefixes=SDAG,COMMON +; RUN: llc -global-isel -global-isel-abort=1 -verify-machineinstrs < %s -mtriple=aarch64-none-linux-gnu | FileCheck %s --check-prefixes=GISEL,COMMON + +declare tailcc void @callee_stack0() +declare tailcc void @callee_stack8([8 x i64], i64) +declare tailcc void @callee_stack16([8 x i64], i64, i64) +declare extern_weak tailcc void @callee_weak() + +define tailcc void @caller_to0_from0() nounwind { +; COMMON-LABEL: caller_to0_from0: +; COMMON-NEXT: // %bb. + + tail call tailcc void @callee_stack0() + ret void + +; COMMON-NEXT: b callee_stack0 +} + +define tailcc void @caller_to0_from8([8 x i64], i64) { +; COMMON-LABEL: caller_to0_from8: + + tail call tailcc void @callee_stack0() + ret void + +; COMMON: add sp, sp, #16 +; COMMON-NEXT: b callee_stack0 +} + +define tailcc void @caller_to8_from0() "frame-pointer"="all"{ +; COMMON-LABEL: caller_to8_from0: + +; Key point is that the "42" should go #16 below incoming stack +; pointer (we didn't have arg space to reuse). + tail call tailcc void @callee_stack8([8 x i64] undef, i64 42) + ret void + +; COMMON: str {{x[0-9]+}}, [x29, #16] +; COMMON: ldp x29, x30, [sp], #16 + ; If there is a sub here then the 42 will be briefly exposed to corruption + ; from an interrupt if the kernel does not honour a red-zone, and a larger + ; call could well overflow the red zone even if it is present. +; COMMON-NOT: sub sp, +; COMMON-NEXT: b callee_stack8 +} + +define tailcc void @caller_to8_from8([8 x i64], i64 %a) { +; COMMON-LABEL: caller_to8_from8: +; COMMON-NOT: sub sp, + +; Key point is that the "%a" should go where at SP on entry. + tail call tailcc void @callee_stack8([8 x i64] undef, i64 42) + ret void + +; COMMON: str {{x[0-9]+}}, [sp] +; COMMON-NEXT: b callee_stack8 +} + +define tailcc void @caller_to16_from8([8 x i64], i64 %a) { +; COMMON-LABEL: caller_to16_from8: +; COMMON-NOT: sub sp, + +; Important point is that the call reuses the "dead" argument space +; above %a on the stack. If it tries to go below incoming-SP then the +; callee will not deallocate the space, even in tailcc. 
+ tail call tailcc void @callee_stack16([8 x i64] undef, i64 42, i64 2) + +; COMMON: stp {{x[0-9]+}}, {{x[0-9]+}}, [sp] +; COMMON-NEXT: b callee_stack16 + ret void +} + + +define tailcc void @caller_to8_from24([8 x i64], i64 %a, i64 %b, i64 %c) { +; COMMON-LABEL: caller_to8_from24: +; COMMON-NOT: sub sp, + +; Key point is that the "%a" should go where at #16 above SP on entry. + tail call tailcc void @callee_stack8([8 x i64] undef, i64 42) + ret void + +; COMMON: str {{x[0-9]+}}, [sp, #16]! +; COMMON-NEXT: b callee_stack8 +} + + +define tailcc void @caller_to16_from16([8 x i64], i64 %a, i64 %b) { +; COMMON-LABEL: caller_to16_from16: +; COMMON-NOT: sub sp, + +; Here we want to make sure that both loads happen before the stores: +; otherwise either %a or %b will be wrongly clobbered. + tail call tailcc void @callee_stack16([8 x i64] undef, i64 %b, i64 %a) + ret void + +; COMMON: ldp {{x[0-9]+}}, {{x[0-9]+}}, [sp] +; COMMON: stp {{x[0-9]+}}, {{x[0-9]+}}, [sp] +; COMMON-NEXT: b callee_stack16 +} + +define tailcc void @disable_tail_calls() nounwind "disable-tail-calls"="true" { +; COMMON-LABEL: disable_tail_calls: +; COMMON-NEXT: // %bb. + + tail call tailcc void @callee_stack0() + ret void + +; COMMON: bl callee_stack0 +; COMMON: ret +} + +; Weakly-referenced extern functions cannot be tail-called, as AAELF does +; not define the behaviour of branch instructions to undefined weak symbols. +define tailcc void @caller_weak() { +; COMMON-LABEL: caller_weak: +; COMMON: bl callee_weak + tail call void @callee_weak() + ret void +} + +declare { [2 x float] } @get_vec2() + +define { [3 x float] } @test_add_elem() { +; SDAG-LABEL: test_add_elem: +; SDAG: bl get_vec2 +; SDAG: fmov s2, #1.0 +; SDAG: ret +; GISEL-LABEL: test_add_elem: +; GISEL: str x30, [sp, #-16]! +; GISEL: bl get_vec2 +; GISEL: fmov s2, #1.0 +; GISEL: ldr x30, [sp], #16 +; GISEL: ret + + %call = tail call { [2 x float] } @get_vec2() + %arr = extractvalue { [2 x float] } %call, 0 + %arr.0 = extractvalue [2 x float] %arr, 0 + %arr.1 = extractvalue [2 x float] %arr, 1 + + %res.0 = insertvalue { [3 x float] } undef, float %arr.0, 0, 0 + %res.01 = insertvalue { [3 x float] } %res.0, float %arr.1, 0, 1 + %res.012 = insertvalue { [3 x float] } %res.01, float 1.000000e+00, 0, 2 + ret { [3 x float] } %res.012 +} + +declare double @get_double() +define { double, [2 x double] } @test_mismatched_insert() { +; COMMON-LABEL: test_mismatched_insert: +; COMMON: bl get_double +; COMMON: bl get_double +; COMMON: bl get_double +; COMMON: ret + + %val0 = call double @get_double() + %val1 = call double @get_double() + %val2 = tail call double @get_double() + + %res.0 = insertvalue { double, [2 x double] } undef, double %val0, 0 + %res.01 = insertvalue { double, [2 x double] } %res.0, double %val1, 1, 0 + %res.012 = insertvalue { double, [2 x double] } %res.01, double %val2, 1, 1 + + ret { double, [2 x double] } %res.012 +} + +define void @fromC_totail() { +; COMMON-LABEL: fromC_totail: +; COMMON: sub sp, sp, #32 + +; COMMON-NOT: sub sp, +; COMMON: mov w[[TMP:[0-9]+]], #42 +; COMMON: str x[[TMP]], [sp] +; COMMON: bl callee_stack8 + ; We must reset the stack to where it was before the call by undoing its extra stack pop. +; COMMON: str x[[TMP]], [sp, #-16]! 
+; COMMON: bl callee_stack8 +; COMMON: sub sp, sp, #16 + + call tailcc void @callee_stack8([8 x i64] undef, i64 42) + call tailcc void @callee_stack8([8 x i64] undef, i64 42) + ret void +} + +define void @fromC_totail_noreservedframe(i32 %len) { +; COMMON-LABEL: fromC_totail_noreservedframe: +; COMMON: stp x29, x30, [sp, #-32]! + +; COMMON: mov w[[TMP:[0-9]+]], #42 + ; Note stack is subtracted here to allocate space for arg +; COMMON: str x[[TMP]], [sp, #-16]! +; COMMON: bl callee_stack8 + ; And here. +; COMMON: str x[[TMP]], [sp, #-16]! +; COMMON: bl callee_stack8 + ; But not restored here because callee_stack8 did that for us. +; COMMON-NOT: sub sp, + + ; Variable sized allocation prevents reserving frame at start of function so each call must allocate any stack space it needs. + %var = alloca i32, i32 %len + + call tailcc void @callee_stack8([8 x i64] undef, i64 42) + call tailcc void @callee_stack8([8 x i64] undef, i64 42) + ret void +} + +declare void @Ccallee_stack8([8 x i64], i64) + +define tailcc void @fromtail_toC() { +; COMMON-LABEL: fromtail_toC: +; COMMON: sub sp, sp, #32 + +; COMMON-NOT: sub sp, +; COMMON: mov w[[TMP:[0-9]+]], #42 +; COMMON: str x[[TMP]], [sp] +; COMMON: bl Ccallee_stack8 + ; C callees will return with the stack exactly where we left it, so we mustn't try to fix anything. +; COMMON-NOT: add sp, +; COMMON-NOT: sub sp, +; COMMON: str x[[TMP]], [sp]{{$}} +; COMMON: bl Ccallee_stack8 +; COMMON-NOT: sub sp, + + + call void @Ccallee_stack8([8 x i64] undef, i64 42) + call void @Ccallee_stack8([8 x i64] undef, i64 42) + ret void +} Index: llvm/test/CodeGen/ARM/swifttailcc.ll =================================================================== --- /dev/null +++ llvm/test/CodeGen/ARM/swifttailcc.ll @@ -0,0 +1,8 @@ +; RUN: llc -mtriple thumbv7k-apple-watchos %s -o - | FileCheck %s + +define float @verify_aapcs_vfp(float %in) { +; CHECK: vadd.f32 s0, s0, s0 + + %res = fadd float %in, %in + ret float %res +} Index: llvm/test/CodeGen/X86/swifttail-async-i386.ll =================================================================== --- /dev/null +++ llvm/test/CodeGen/X86/swifttail-async-i386.ll @@ -0,0 +1,22 @@ +; RUN: llc -mtriple=i386-apple-darwin %s -o - | FileCheck %s + +declare void @clobber() + +declare swifttailcc void @swifttail_callee() +define swifttailcc void @swifttail() { +; CHECK-LABEL: swifttail: +; CHECK-NOT: %rbx + call void @clobber() + tail call swifttailcc void @swifttail_callee() + ret void +} + +declare swifttailcc void @swiftself(i8* swiftself) + +define swifttailcc void @swifttail2(i8* %arg) { +; CHECK-LABEL: swifttail2: +; CHECK: movl {{.*}}, %ecx +; CHECK: jmp _swiftself + tail call swifttailcc void @swiftself(i8* swiftself %arg) + ret void +} Index: llvm/test/CodeGen/X86/swifttail-async.ll =================================================================== --- /dev/null +++ llvm/test/CodeGen/X86/swifttail-async.ll @@ -0,0 +1,28 @@ +; RUN: llc -mtriple=x86_64-apple-darwin %s -o - | FileCheck %s + + +declare swifttailcc void @swifttail_callee() +define swifttailcc void @swifttail() { +; CHECK-LABEL: swifttail: +; CHECK-NOT: popq %r14 + call void asm "","~{r14}"() + tail call swifttailcc void @swifttail_callee() + ret void +} + +define swifttailcc void @no_preserve_swiftself() { +; CHECK-LABEL: no_preserve_swiftself: +; CHECK-NOT: popq %r13 + call void asm "","~{r13}"() + ret void +} + +declare swifttailcc i8* @SwiftSelf(i8 * swiftasync %context, i8* swiftself %closure) +define swiftcc i8* @CallSwiftSelf(i8* swiftself %closure, i8* %context) { +; 
CHECK-LABEL: CallSwiftSelf: +; CHECK: pushq %r13 + ;call void asm "","~{r13}"() ; We get a push r13 but why not with the call + ; below? + %res = call swifttailcc i8* @SwiftSelf(i8 * swiftasync %context, i8* swiftself %closure) + ret i8* %res +} Index: llvm/test/CodeGen/X86/swifttail-return.ll =================================================================== --- /dev/null +++ llvm/test/CodeGen/X86/swifttail-return.ll @@ -0,0 +1,29 @@ +; RUN: llc -verify-machineinstrs < %s -mtriple=x86_64-unknown-unknown | FileCheck %s +; RUN: llc -verify-machineinstrs < %s -mtriple=x86_64-unknown-unknown -O0 | FileCheck %s + +define swifttailcc [4 x i64] @return_int() { +; CHECK-LABEL: return_int: +; CHECK-DAG: movl $1, %eax +; CHECK-DAG: movl $2, %edx +; CHECK-DAG: movl $3, %ecx +; CHECK-DAG: movl $4, %r8d + + ret [4 x i64] [i64 1, i64 2, i64 3, i64 4] +} + + +; CHECK: [[ONE:.LCPI.*]]: +; CHECK-NEXT: # double 1 +; CHECK: [[TWO:.LCPI.*]]: +; CHECK-NEXT: # double 2 +; CHECK: [[THREE:.LCPI.*]]: +; CHECK-NEXT: # double 3 + +define swifttailcc [4 x double] @return_float() { +; CHECK-LABEL: return_float: +; CHECK-DAG: movsd [[ONE]](%rip), %xmm1 +; CHECK-DAG: movsd [[TWO]](%rip), %xmm2 +; CHECK-DAG: movsd [[THREE]](%rip), %xmm3 +; CHECK-DAG: xorps %xmm0, %xmm0 + ret [4 x double] [double 0.0, double 1.0, double 2.0, double 3.0] +} Index: llvm/test/CodeGen/X86/tailcall-swifttailcc.ll =================================================================== --- /dev/null +++ llvm/test/CodeGen/X86/tailcall-swifttailcc.ll @@ -0,0 +1,65 @@ +; RUN: llc < %s -mtriple=x86_64-unknown-unknown | FileCheck %s + +declare dso_local swifttailcc i32 @tailcallee(i32 %a1, i32 %a2, i32 %a3, i32 %a4) + +define dso_local swifttailcc i32 @tailcaller(i32 %in1, i32 %in2) nounwind { +; CHECK-LABEL: tailcaller: +; CHECK-NOT: subq +; CHECK-NOT: addq +; CHECK: jmp tailcallee +entry: + %tmp11 = tail call swifttailcc i32 @tailcallee(i32 %in1, i32 %in2, i32 %in1, i32 %in2) + ret i32 %tmp11 +} + +declare dso_local swifttailcc i8* @alias_callee() + +define swifttailcc noalias i8* @noalias_caller() nounwind { +; CHECK-LABEL: noalias_caller: +; CHECK: jmp alias_callee + %p = musttail call swifttailcc i8* @alias_callee() + ret i8* %p +} + +declare dso_local swifttailcc noalias i8* @noalias_callee() + +define dso_local swifttailcc i8* @alias_caller() nounwind { +; CHECK-LABEL: alias_caller: +; CHECK: jmp noalias_callee # TAILCALL + %p = tail call swifttailcc noalias i8* @noalias_callee() + ret i8* %p +} + +declare dso_local swifttailcc i32 @i32_callee() + +define dso_local swifttailcc i32 @ret_undef() nounwind { +; CHECK-LABEL: ret_undef: +; CHECK: jmp i32_callee # TAILCALL + %p = tail call swifttailcc i32 @i32_callee() + ret i32 undef +} + +declare dso_local swifttailcc void @does_not_return() + +define dso_local swifttailcc i32 @noret() nounwind { +; CHECK-LABEL: noret: +; CHECK: jmp does_not_return + tail call swifttailcc void @does_not_return() + unreachable +} + +define dso_local swifttailcc void @void_test(i32, i32, i32, i32) { +; CHECK-LABEL: void_test: +; CHECK: jmp void_test + entry: + tail call swifttailcc void @void_test( i32 %0, i32 %1, i32 %2, i32 %3) + ret void +} + +define dso_local swifttailcc i1 @i1test(i32, i32, i32, i32) { +; CHECK-LABEL: i1test: +; CHECK: jmp i1test + entry: + %4 = tail call swifttailcc i1 @i1test( i32 %0, i32 %1, i32 %2, i32 %3) + ret i1 %4 +} Index: llvm/utils/emacs/llvm-mode.el =================================================================== --- llvm/utils/emacs/llvm-mode.el +++ llvm/utils/emacs/llvm-mode.el @@ 
-57,7 +57,7 @@
    ;; Calling conventions
    "ccc" "fastcc" "coldcc" "webkit_jscc" "anyregcc" "preserve_mostcc"
    "preserve_allcc"
-   "cxx_fast_tlscc" "swiftcc" "tailcc" "cfguard_checkcc"
+   "cxx_fast_tlscc" "swiftcc" "tailcc" "swifttailcc" "cfguard_checkcc"
    ;; Visibility styles
    "default" "hidden" "protected"
    ;; DLL storages
Index: llvm/utils/vim/syntax/llvm.vim
===================================================================
--- llvm/utils/vim/syntax/llvm.vim
+++ llvm/utils/vim/syntax/llvm.vim
@@ -176,6 +176,7 @@
       \ strictfp
       \ swiftcc
       \ swifterror
+      \ swifttailcc
       \ swiftself
       \ syncscope
       \ tail
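
Note: as a minimal illustration of the LangRef wording added above (not part of the diff itself, with made-up function names), the IR sketch below uses the new convention. Because caller and callee are both swifttailcc and the call sits in tail position, it is expected to be lowered as a tail call (a plain branch rather than bl/call), and the callee pops its own stack-argument area, which is what the AArch64 and X86 tests in this patch check.

; Illustration only -- not from the patch; @callee and @caller are hypothetical.
; Both functions use swifttailcc, so the call below should become a branch and
; @callee deallocates the stack slot holding its i64 argument itself.
declare swifttailcc void @callee([8 x i64], i64)

define swifttailcc void @caller([8 x i64] %regs, i64 %x) {
entry:
  tail call swifttailcc void @callee([8 x i64] undef, i64 %x)
  ret void
}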
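
Note: a rough worked example of the new AArch64 stack bookkeeping, with made-up numbers. If a function itself received 16 bytes of stack arguments (so NumReusableBytes is 16) and contains a tail call whose outgoing stack arguments need 32 bytes (NumBytes is 32), LowerCall computes FPDiff = 16 - 32 = -16 and records the 16-byte shortfall through setTailCallReservedStack. getFixedObjectSize then folds that reservation into the frame laid out by the prologue, just below the incoming arguments, so the epilogue does not have to carve out argument space below the incoming SP at the last moment; the new comments and asserts about an untimely interrupt clobbering the argument area guard exactly that situation.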