Index: llvm/docs/BitCodeFormat.rst =================================================================== --- llvm/docs/BitCodeFormat.rst +++ llvm/docs/BitCodeFormat.rst @@ -794,6 +794,7 @@ * ``preserve_allcc``: code 15 * ``swiftcc`` : code 16 * ``cxx_fast_tlscc``: code 17 + * ``tailcc`` : code 18 * ``x86_stdcallcc``: code 64 * ``x86_fastcallcc``: code 65 * ``arm_apcscc``: code 66 Index: llvm/docs/CodeGenerator.rst =================================================================== --- llvm/docs/CodeGenerator.rst +++ llvm/docs/CodeGenerator.rst @@ -2068,12 +2068,12 @@ and PowerPC if: * Caller and callee have the calling convention ``fastcc``, ``cc 10`` (GHC - calling convention) or ``cc 11`` (HiPE calling convention). + calling convention), ``cc 11`` (HiPE calling convention), or ``tailcc``. * The call is a tail call - in tail position (ret immediately follows call and ret uses value of call or is void). -* Option ``-tailcallopt`` is enabled. +* Option ``-tailcallopt`` is enabled or the calling convention is ``tailcc``. * Platform-specific constraints are met. Index: llvm/docs/LangRef.rst =================================================================== --- llvm/docs/LangRef.rst +++ llvm/docs/LangRef.rst @@ -299,7 +299,7 @@ allows the target to use whatever tricks it wants to produce fast code for the target, without having to conform to an externally specified ABI (Application Binary Interface). `Tail calls can only - be optimized when this, the GHC or the HiPE convention is + be optimized when this, the tailcc, the GHC or the HiPE convention is used. `_ This calling convention does not support varargs and requires the prototype of all callees to exactly match the prototype of the function definition. @@ -436,6 +436,14 @@ - On X86-64 RCX and R8 are available for additional integer returns, and XMM2 and XMM3 are available for additional FP/vector returns. - On iOS platforms, we use AAPCS-VFP calling convention. +"``tailcc``" - Tail callable calling convention + This calling convention ensures that calls in tail position will always be + tail call optimized. This calling convention is equivalent to fastcc, + except for an additional guarantee that tail calls will be produced + whenever possible. `Tail calls can only be optimized when this, the fastcc, + the GHC or the HiPE convention is used. `_ This + calling convention does not support varargs and requires the prototype of + all callees to exactly match the prototype of the function definition. "``cc ``" - Numbered convention Any calling convention may be specified by number, allowing target-specific calling conventions to be used. Target specific @@ -10224,11 +10232,12 @@ Tail call optimization for calls marked ``tail`` is guaranteed to occur if the following conditions are met: - - Caller and callee both have the calling convention ``fastcc``. + - Caller and callee both have the calling convention ``fastcc`` or ``tailcc``. - The call is in tail position (ret immediately follows call and ret uses value of call or is void). - - Option ``-tailcallopt`` is enabled, or - ``llvm::GuaranteedTailCallOpt`` is ``true``. + - Option ``-tailcallopt`` is enabled, + ``llvm::GuaranteedTailCallOpt`` is ``true``, or the calling convention + is ``tailcc`` - `Platform-specific constraints are met. 
`_ Index: llvm/include/llvm/IR/CallingConv.h =================================================================== --- llvm/include/llvm/IR/CallingConv.h +++ llvm/include/llvm/IR/CallingConv.h @@ -75,6 +75,11 @@ // CXX_FAST_TLS - Calling convention for access functions. CXX_FAST_TLS = 17, + /// Tail - This calling convention attempts to make calls as fast as + /// possible while guaranteeing that tail call optimization can always + /// be performed. + Tail = 18, + // Target - This is the start of the target-specific calling conventions, // e.g. fastcall and thiscall on X86. FirstTargetCC = 64, Index: llvm/lib/AsmParser/LLLexer.cpp =================================================================== --- llvm/lib/AsmParser/LLLexer.cpp +++ llvm/lib/AsmParser/LLLexer.cpp @@ -622,6 +622,7 @@ KEYWORD(amdgpu_ps); KEYWORD(amdgpu_cs); KEYWORD(amdgpu_kernel); + KEYWORD(tailcc); KEYWORD(cc); KEYWORD(c); Index: llvm/lib/AsmParser/LLParser.cpp =================================================================== --- llvm/lib/AsmParser/LLParser.cpp +++ llvm/lib/AsmParser/LLParser.cpp @@ -1955,6 +1955,7 @@ /// ::= 'amdgpu_ps' /// ::= 'amdgpu_cs' /// ::= 'amdgpu_kernel' +/// ::= 'tailcc' /// ::= 'cc' UINT /// bool LLParser::ParseOptionalCallingConv(unsigned &CC) { @@ -2000,6 +2001,7 @@ case lltok::kw_amdgpu_ps: CC = CallingConv::AMDGPU_PS; break; case lltok::kw_amdgpu_cs: CC = CallingConv::AMDGPU_CS; break; case lltok::kw_amdgpu_kernel: CC = CallingConv::AMDGPU_KERNEL; break; + case lltok::kw_tailcc: CC = CallingConv::Tail; break; case lltok::kw_cc: { Lex.Lex(); return ParseUInt32(CC); Index: llvm/lib/AsmParser/LLToken.h =================================================================== --- llvm/lib/AsmParser/LLToken.h +++ llvm/lib/AsmParser/LLToken.h @@ -168,6 +168,7 @@ kw_amdgpu_ps, kw_amdgpu_cs, kw_amdgpu_kernel, + kw_tailcc, // Attributes: kw_attributes, Index: llvm/lib/CodeGen/Analysis.cpp =================================================================== --- llvm/lib/CodeGen/Analysis.cpp +++ llvm/lib/CodeGen/Analysis.cpp @@ -523,7 +523,8 @@ // longjmp on x86), it can end up causing miscompilation that has not // been fully understood. if (!Ret && - (!TM.Options.GuaranteedTailCallOpt || !isa<UnreachableInst>(Term))) + ((!TM.Options.GuaranteedTailCallOpt && + CS.getCallingConv() != CallingConv::Tail) || !isa<UnreachableInst>(Term))) return false; // If I will have a chain, make sure no other instruction that will have a Index: llvm/lib/IR/AsmWriter.cpp =================================================================== --- llvm/lib/IR/AsmWriter.cpp +++ llvm/lib/IR/AsmWriter.cpp @@ -352,6 +352,7 @@ case CallingConv::PreserveAll: Out << "preserve_allcc"; break; case CallingConv::CXX_FAST_TLS: Out << "cxx_fast_tlscc"; break; case CallingConv::GHC: Out << "ghccc"; break; + case CallingConv::Tail: Out << "tailcc"; break; case CallingConv::X86_StdCall: Out << "x86_stdcallcc"; break; case CallingConv::X86_FastCall: Out << "x86_fastcallcc"; break; case CallingConv::X86_ThisCall: Out << "x86_thiscallcc"; break; Index: llvm/lib/Target/X86/X86CallingConv.td =================================================================== --- llvm/lib/Target/X86/X86CallingConv.td +++ llvm/lib/Target/X86/X86CallingConv.td @@ -433,6 +433,7 @@ def RetCC_X86_32 : CallingConv<[ // If FastCC, use RetCC_X86_32_Fast. CCIfCC<"CallingConv::Fast", CCDelegateTo<RetCC_X86_32_Fast>>, + CCIfCC<"CallingConv::Tail", CCDelegateTo<RetCC_X86_32_Fast>>, // If HiPE, use RetCC_X86_32_HiPE.
CCIfCC<"CallingConv::HiPE", CCDelegateTo>, CCIfCC<"CallingConv::X86_VectorCall", CCDelegateTo>, @@ -1000,6 +1001,7 @@ CCIfCC<"CallingConv::X86_VectorCall", CCDelegateTo>, CCIfCC<"CallingConv::X86_ThisCall", CCDelegateTo>, CCIfCC<"CallingConv::Fast", CCDelegateTo>, + CCIfCC<"CallingConv::Tail", CCDelegateTo>, CCIfCC<"CallingConv::GHC", CCDelegateTo>, CCIfCC<"CallingConv::HiPE", CCDelegateTo>, CCIfCC<"CallingConv::X86_RegCall", CCDelegateTo>, Index: llvm/lib/Target/X86/X86FastISel.cpp =================================================================== --- llvm/lib/Target/X86/X86FastISel.cpp +++ llvm/lib/Target/X86/X86FastISel.cpp @@ -1160,6 +1160,7 @@ CallingConv::ID CC = F.getCallingConv(); if (CC != CallingConv::C && CC != CallingConv::Fast && + CC != CallingConv::Tail && CC != CallingConv::X86_FastCall && CC != CallingConv::X86_StdCall && CC != CallingConv::X86_ThisCall && @@ -1173,7 +1174,8 @@ // fastcc with -tailcallopt is intended to provide a guaranteed // tail call optimization. Fastisel doesn't know how to do that. - if (CC == CallingConv::Fast && TM.Options.GuaranteedTailCallOpt) + if ((CC == CallingConv::Fast && TM.Options.GuaranteedTailCallOpt) || + CC == CallingConv::Tail) return false; // Let SDISel handle vararg functions. @@ -3157,7 +3159,7 @@ if (Subtarget->getTargetTriple().isOSMSVCRT()) return 0; if (CC == CallingConv::Fast || CC == CallingConv::GHC || - CC == CallingConv::HiPE) + CC == CallingConv::HiPE || CC == CallingConv::Tail) return 0; if (CS) @@ -3208,6 +3210,7 @@ default: return false; case CallingConv::C: case CallingConv::Fast: + case CallingConv::Tail: case CallingConv::WebKit_JS: case CallingConv::Swift: case CallingConv::X86_FastCall: @@ -3224,7 +3227,8 @@ // fastcc with -tailcallopt is intended to provide a guaranteed // tail call optimization. Fastisel doesn't know how to do that. - if (CC == CallingConv::Fast && TM.Options.GuaranteedTailCallOpt) + if ((CC == CallingConv::Fast && TM.Options.GuaranteedTailCallOpt) || + CC == CallingConv::Tail) return false; // Don't know how to handle Win64 varargs yet. Nothing special needed for Index: llvm/lib/Target/X86/X86FrameLowering.cpp =================================================================== --- llvm/lib/Target/X86/X86FrameLowering.cpp +++ llvm/lib/Target/X86/X86FrameLowering.cpp @@ -2269,7 +2269,8 @@ bool IsNested = HasNestArgument(&MF); if (CallingConvention == CallingConv::X86_FastCall || - CallingConvention == CallingConv::Fast) { + CallingConvention == CallingConv::Fast || + CallingConvention == CallingConv::Tail) { if (IsNested) report_fatal_error("Segmented stacks does not support fastcall with " "nested function."); Index: llvm/lib/Target/X86/X86ISelLowering.cpp =================================================================== --- llvm/lib/Target/X86/X86ISelLowering.cpp +++ llvm/lib/Target/X86/X86ISelLowering.cpp @@ -3018,7 +3018,7 @@ static bool canGuaranteeTCO(CallingConv::ID CC) { return (CC == CallingConv::Fast || CC == CallingConv::GHC || CC == CallingConv::X86_RegCall || CC == CallingConv::HiPE || - CC == CallingConv::HHVM); + CC == CallingConv::HHVM || CC == CallingConv::Tail); } /// Return true if we might ever do TCO for calls with this calling convention. @@ -3044,7 +3044,7 @@ /// Return true if the function is being made into a tailcall target by /// changing its ABI. 
static bool shouldGuaranteeTCO(CallingConv::ID CC, bool GuaranteedTailCallOpt) { - return GuaranteedTailCallOpt && canGuaranteeTCO(CC); + return (GuaranteedTailCallOpt && canGuaranteeTCO(CC)) || CC == CallingConv::Tail; } bool X86TargetLowering::mayBeEmittedAsTailCall(const CallInst *CI) const { @@ -3670,6 +3670,8 @@ bool IsWin64 = Subtarget.isCallingConvWin64(CallConv); StructReturnType SR = callIsStructReturn(Outs, Subtarget.isTargetMCU()); bool IsSibcall = false; + bool IsGuaranteeTCO = MF.getTarget().Options.GuaranteedTailCallOpt || + CallConv == CallingConv::Tail; X86MachineFunctionInfo *X86Info = MF.getInfo(); auto Attr = MF.getFunction().getFnAttribute("disable-tail-calls"); const auto *CI = dyn_cast_or_null(CLI.CS.getInstruction()); @@ -3690,8 +3692,7 @@ if (Attr.getValueAsString() == "true") isTailCall = false; - if (Subtarget.isPICStyleGOT() && - !MF.getTarget().Options.GuaranteedTailCallOpt) { + if (Subtarget.isPICStyleGOT() && !IsGuaranteeTCO) { // If we are using a GOT, disable tail calls to external symbols with // default visibility. Tail calling such a symbol requires using a GOT // relocation, which forces early binding of the symbol. This breaks code @@ -3718,7 +3719,7 @@ // Sibcalls are automatically detected tailcalls which do not require // ABI changes. - if (!MF.getTarget().Options.GuaranteedTailCallOpt && isTailCall) + if (!IsGuaranteeTCO && isTailCall) IsSibcall = true; if (isTailCall) @@ -3750,8 +3751,7 @@ // This is a sibcall. The memory operands are available in caller's // own caller's stack. NumBytes = 0; - else if (MF.getTarget().Options.GuaranteedTailCallOpt && - canGuaranteeTCO(CallConv)) + else if (IsGuaranteeTCO && canGuaranteeTCO(CallConv)) NumBytes = GetAlignedArgumentStackSize(NumBytes, DAG); int FPDiff = 0; @@ -4376,6 +4376,8 @@ bool CCMatch = CallerCC == CalleeCC; bool IsCalleeWin64 = Subtarget.isCallingConvWin64(CalleeCC); bool IsCallerWin64 = Subtarget.isCallingConvWin64(CallerCC); + bool IsGuaranteeTCO = DAG.getTarget().Options.GuaranteedTailCallOpt || + CalleeCC == CallingConv::Tail; // Win64 functions have extra shadow space for argument homing. Don't do the // sibcall if the caller and callee have mismatched expectations for this @@ -4383,7 +4385,7 @@ if (IsCalleeWin64 != IsCallerWin64) return false; - if (DAG.getTarget().Options.GuaranteedTailCallOpt) { + if (IsGuaranteeTCO) { if (canGuaranteeTCO(CalleeCC) && CCMatch) return true; return false; @@ -24546,6 +24548,7 @@ case CallingConv::X86_FastCall: case CallingConv::X86_ThisCall: case CallingConv::Fast: + case CallingConv::Tail: // Pass 'nest' parameter in EAX. // Must be kept in sync with X86CallingConv.td NestReg = X86::EAX; Index: llvm/lib/Target/X86/X86Subtarget.h =================================================================== --- llvm/lib/Target/X86/X86Subtarget.h +++ llvm/lib/Target/X86/X86Subtarget.h @@ -815,6 +815,7 @@ // On Win64, all these conventions just use the default convention. 
case CallingConv::C: case CallingConv::Fast: + case CallingConv::Tail: case CallingConv::Swift: case CallingConv::X86_FastCall: case CallingConv::X86_StdCall: Index: llvm/test/CodeGen/X86/musttail-tailcc.ll =================================================================== --- /dev/null +++ llvm/test/CodeGen/X86/musttail-tailcc.ll @@ -0,0 +1,114 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=x86_64-unknown-unknown | FileCheck %s -check-prefix=X64 +; RUN: llc < %s -mtriple=i686-unknown-unknown | FileCheck %s -check-prefix=X32 + +; tailcc will turn all of these musttail calls into tail calls. + +declare tailcc i32 @tailcallee(i32 %a1, i32 %a2) + +define tailcc i32 @tailcaller(i32 %in1, i32 %in2) nounwind { +; X64-LABEL: tailcaller: +; X64: # %bb.0: # %entry +; X64-NEXT: pushq %rax +; X64-NEXT: popq %rax +; X64-NEXT: jmp tailcallee # TAILCALL +; +; X32-LABEL: tailcaller: +; X32: # %bb.0: # %entry +; X32-NEXT: jmp tailcallee # TAILCALL +entry: + %tmp11 = musttail call tailcc i32 @tailcallee(i32 %in1, i32 %in2) + ret i32 %tmp11 +} + +declare tailcc i8* @alias_callee() + +define tailcc noalias i8* @noalias_caller() nounwind { +; X64-LABEL: noalias_caller: +; X64: # %bb.0: +; X64-NEXT: pushq %rax +; X64-NEXT: popq %rax +; X64-NEXT: jmp alias_callee # TAILCALL +; +; X32-LABEL: noalias_caller: +; X32: # %bb.0: +; X32-NEXT: jmp alias_callee # TAILCALL + %p = musttail call tailcc i8* @alias_callee() + ret i8* %p +} + +declare tailcc noalias i8* @noalias_callee() + +define tailcc i8* @alias_caller() nounwind { +; X64-LABEL: alias_caller: +; X64: # %bb.0: +; X64-NEXT: pushq %rax +; X64-NEXT: popq %rax +; X64-NEXT: jmp noalias_callee # TAILCALL +; +; X32-LABEL: alias_caller: +; X32: # %bb.0: +; X32-NEXT: jmp noalias_callee # TAILCALL + %p = musttail call tailcc noalias i8* @noalias_callee() + ret i8* %p +} + +define tailcc void @void_test(i32, i32, i32, i32) { +; X64-LABEL: void_test: +; X64: # %bb.0: # %entry +; X64-NEXT: pushq %rax +; X64-NEXT: .cfi_def_cfa_offset 16 +; X64-NEXT: popq %rax +; X64-NEXT: .cfi_def_cfa_offset 8 +; X64-NEXT: jmp void_test # TAILCALL +; +; X32-LABEL: void_test: +; X32: # %bb.0: # %entry +; X32-NEXT: pushl %esi +; X32-NEXT: .cfi_def_cfa_offset 8 +; X32-NEXT: subl $8, %esp +; X32-NEXT: .cfi_def_cfa_offset 16 +; X32-NEXT: .cfi_offset %esi, -8 +; X32-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-NEXT: movl {{[0-9]+}}(%esp), %esi +; X32-NEXT: movl %esi, {{[0-9]+}}(%esp) +; X32-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X32-NEXT: addl $8, %esp +; X32-NEXT: .cfi_def_cfa_offset 8 +; X32-NEXT: popl %esi +; X32-NEXT: .cfi_def_cfa_offset 4 +; X32-NEXT: jmp void_test # TAILCALL + entry: + musttail call tailcc void @void_test( i32 %0, i32 %1, i32 %2, i32 %3) + ret void +} + +define tailcc i1 @i1test(i32, i32, i32, i32) { +; X64-LABEL: i1test: +; X64: # %bb.0: # %entry +; X64-NEXT: pushq %rax +; X64-NEXT: .cfi_def_cfa_offset 16 +; X64-NEXT: popq %rax +; X64-NEXT: .cfi_def_cfa_offset 8 +; X64-NEXT: jmp i1test # TAILCALL +; +; X32-LABEL: i1test: +; X32: # %bb.0: # %entry +; X32-NEXT: pushl %esi +; X32-NEXT: .cfi_def_cfa_offset 8 +; X32-NEXT: subl $8, %esp +; X32-NEXT: .cfi_def_cfa_offset 16 +; X32-NEXT: .cfi_offset %esi, -8 +; X32-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-NEXT: movl {{[0-9]+}}(%esp), %esi +; X32-NEXT: movl %esi, {{[0-9]+}}(%esp) +; X32-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X32-NEXT: addl $8, %esp +; X32-NEXT: .cfi_def_cfa_offset 8 +; X32-NEXT: popl %esi +; X32-NEXT: .cfi_def_cfa_offset 4 +; X32-NEXT: jmp i1test # TAILCALL + 
entry: + %4 = musttail call tailcc i1 @i1test( i32 %0, i32 %1, i32 %2, i32 %3) + ret i1 %4 +} Index: llvm/test/CodeGen/X86/tailcall-tailcc.ll =================================================================== --- /dev/null +++ llvm/test/CodeGen/X86/tailcall-tailcc.ll @@ -0,0 +1,155 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=x86_64-unknown-unknown | FileCheck %s -check-prefix=X64 +; RUN: llc < %s -mtriple=i686-unknown-unknown | FileCheck %s -check-prefix=X32 + +; With -tailcallopt, CodeGen guarantees a tail call optimization +; for all of these. + +declare tailcc i32 @tailcallee(i32 %a1, i32 %a2, i32 %a3, i32 %a4) + +define tailcc i32 @tailcaller(i32 %in1, i32 %in2) nounwind { +; X64-LABEL: tailcaller: +; X64: # %bb.0: # %entry +; X64-NEXT: pushq %rax +; X64-NEXT: movl %edi, %edx +; X64-NEXT: movl %esi, %ecx +; X64-NEXT: popq %rax +; X64-NEXT: jmp tailcallee # TAILCALL +; +; X32-LABEL: tailcaller: +; X32: # %bb.0: # %entry +; X32-NEXT: subl $16, %esp +; X32-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X32-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X32-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X32-NEXT: addl $8, %esp +; X32-NEXT: jmp tailcallee # TAILCALL +entry: + %tmp11 = tail call tailcc i32 @tailcallee(i32 %in1, i32 %in2, i32 %in1, i32 %in2) + ret i32 %tmp11 +} + +declare tailcc i8* @alias_callee() + +define tailcc noalias i8* @noalias_caller() nounwind { +; X64-LABEL: noalias_caller: +; X64: # %bb.0: +; X64-NEXT: pushq %rax +; X64-NEXT: popq %rax +; X64-NEXT: jmp alias_callee # TAILCALL +; +; X32-LABEL: noalias_caller: +; X32: # %bb.0: +; X32-NEXT: jmp alias_callee # TAILCALL + %p = tail call tailcc i8* @alias_callee() + ret i8* %p +} + +declare tailcc noalias i8* @noalias_callee() + +define tailcc i8* @alias_caller() nounwind { +; X64-LABEL: alias_caller: +; X64: # %bb.0: +; X64-NEXT: pushq %rax +; X64-NEXT: popq %rax +; X64-NEXT: jmp noalias_callee # TAILCALL +; +; X32-LABEL: alias_caller: +; X32: # %bb.0: +; X32-NEXT: jmp noalias_callee # TAILCALL + %p = tail call tailcc noalias i8* @noalias_callee() + ret i8* %p +} + +declare tailcc i32 @i32_callee() + +define tailcc i32 @ret_undef() nounwind { +; X64-LABEL: ret_undef: +; X64: # %bb.0: +; X64-NEXT: pushq %rax +; X64-NEXT: popq %rax +; X64-NEXT: jmp i32_callee # TAILCALL +; +; X32-LABEL: ret_undef: +; X32: # %bb.0: +; X32-NEXT: jmp i32_callee # TAILCALL + %p = tail call tailcc i32 @i32_callee() + ret i32 undef +} + +declare tailcc void @does_not_return() + +define tailcc i32 @noret() nounwind { +; X64-LABEL: noret: +; X64: # %bb.0: +; X64-NEXT: pushq %rax +; X64-NEXT: popq %rax +; X64-NEXT: jmp does_not_return # TAILCALL +; +; X32-LABEL: noret: +; X32: # %bb.0: +; X32-NEXT: jmp does_not_return # TAILCALL + tail call tailcc void @does_not_return() + unreachable +} + +define tailcc void @void_test(i32, i32, i32, i32) { +; X64-LABEL: void_test: +; X64: # %bb.0: # %entry +; X64-NEXT: pushq %rax +; X64-NEXT: .cfi_def_cfa_offset 16 +; X64-NEXT: popq %rax +; X64-NEXT: .cfi_def_cfa_offset 8 +; X64-NEXT: jmp void_test # TAILCALL +; +; X32-LABEL: void_test: +; X32: # %bb.0: # %entry +; X32-NEXT: pushl %esi +; X32-NEXT: .cfi_def_cfa_offset 8 +; X32-NEXT: subl $8, %esp +; X32-NEXT: .cfi_def_cfa_offset 16 +; X32-NEXT: .cfi_offset %esi, -8 +; X32-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-NEXT: movl {{[0-9]+}}(%esp), %esi +; X32-NEXT: movl %esi, {{[0-9]+}}(%esp) +; X32-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X32-NEXT: addl $8, %esp +; X32-NEXT: .cfi_def_cfa_offset 8 
+; X32-NEXT: popl %esi +; X32-NEXT: .cfi_def_cfa_offset 4 +; X32-NEXT: jmp void_test # TAILCALL + entry: + tail call tailcc void @void_test( i32 %0, i32 %1, i32 %2, i32 %3) + ret void +} + +define tailcc i1 @i1test(i32, i32, i32, i32) { +; X64-LABEL: i1test: +; X64: # %bb.0: # %entry +; X64-NEXT: pushq %rax +; X64-NEXT: .cfi_def_cfa_offset 16 +; X64-NEXT: popq %rax +; X64-NEXT: .cfi_def_cfa_offset 8 +; X64-NEXT: jmp i1test # TAILCALL +; +; X32-LABEL: i1test: +; X32: # %bb.0: # %entry +; X32-NEXT: pushl %esi +; X32-NEXT: .cfi_def_cfa_offset 8 +; X32-NEXT: subl $8, %esp +; X32-NEXT: .cfi_def_cfa_offset 16 +; X32-NEXT: .cfi_offset %esi, -8 +; X32-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-NEXT: movl {{[0-9]+}}(%esp), %esi +; X32-NEXT: movl %esi, {{[0-9]+}}(%esp) +; X32-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X32-NEXT: addl $8, %esp +; X32-NEXT: .cfi_def_cfa_offset 8 +; X32-NEXT: popl %esi +; X32-NEXT: .cfi_def_cfa_offset 4 +; X32-NEXT: jmp i1test # TAILCALL + entry: + %4 = tail call tailcc i1 @i1test( i32 %0, i32 %1, i32 %2, i32 %3) + ret i1 %4 +} Index: llvm/test/CodeGen/X86/tailcc-calleesave.ll =================================================================== --- /dev/null +++ llvm/test/CodeGen/X86/tailcc-calleesave.ll @@ -0,0 +1,19 @@ +; RUN: llc -mcpu=core < %s | FileCheck %s + +target triple = "i686-apple-darwin" + +declare tailcc void @foo(i32, i32, i32, i32, i32, i32) +declare i32* @bar(i32*) + +define tailcc void @hoge(i32 %b) nounwind { +; Do not overwrite pushed callee-save registers +; CHECK: pushl +; CHECK: subl $[[SIZE:[0-9]+]], %esp +; CHECK-NOT: [[SIZE]](%esp) + %a = alloca i32 + store i32 0, i32* %a + %d = tail call i32* @bar(i32* %a) nounwind + store i32 %b, i32* %d + tail call tailcc void @foo(i32 1, i32 2, i32 3, i32 4, i32 5, i32 6) nounwind + ret void +} Index: llvm/test/CodeGen/X86/tailcc-disable-tail-calls.ll =================================================================== --- /dev/null +++ llvm/test/CodeGen/X86/tailcc-disable-tail-calls.ll @@ -0,0 +1,40 @@ +; RUN: llc < %s -mtriple=x86_64-- | FileCheck %s --check-prefix=NO-OPTION +; RUN: llc < %s -mtriple=x86_64-- -disable-tail-calls | FileCheck %s --check-prefix=DISABLE-TRUE +; RUN: llc < %s -mtriple=x86_64-- -disable-tail-calls=false | FileCheck %s --check-prefix=DISABLE-FALSE + +; Check that command line option "-disable-tail-calls" overrides function +; attribute "disable-tail-calls". 
+ +; NO-OPTION-LABEL: {{\_?}}func_attr +; NO-OPTION: callq {{\_?}}callee + +; DISABLE-FALSE-LABEL: {{\_?}}func_attr +; DISABLE-FALSE: jmp {{\_?}}callee + +; DISABLE-TRUE-LABEL: {{\_?}}func_attr +; DISABLE-TRUE: callq {{\_?}}callee + +define tailcc i32 @func_attr(i32 %a) #0 { +entry: + %call = tail call tailcc i32 @callee(i32 %a) + ret i32 %call +} + +; NO-OPTION-LABEL: {{\_?}}func_noattr +; NO-OPTION: jmp {{\_?}}callee + +; DISABLE-FALSE-LABEL: {{\_?}}func_noattr +; DISABLE-FALSE: jmp {{\_?}}callee + +; DISABLE-TRUE-LABEL: {{\_?}}func_noattr +; DISABLE-TRUE: callq {{\_?}}callee + +define tailcc i32 @func_noattr(i32 %a) { +entry: + %call = tail call tailcc i32 @callee(i32 %a) + ret i32 %call +} + +declare tailcc i32 @callee(i32) + +attributes #0 = { "disable-tail-calls"="true" } Index: llvm/test/CodeGen/X86/tailcc-fastcc.ll =================================================================== --- /dev/null +++ llvm/test/CodeGen/X86/tailcc-fastcc.ll @@ -0,0 +1,49 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -tailcallopt < %s -mtriple=x86_64-unknown-unknown | FileCheck %s -check-prefix=X64 +; RUN: llc -tailcallopt < %s -mtriple=i686-unknown-unknown | FileCheck %s -check-prefix=X32 + +; llc -tailcallopt should not enable tail calls from fastcc to tailcc or vice versa + +declare tailcc i32 @tailcallee1(i32 %a1, i32 %a2, i32 %a3, i32 %a4) + +define fastcc i32 @tailcaller1(i32 %in1, i32 %in2) nounwind { +; X64-LABEL: tailcaller1: +; X64: # %bb.0: # %entry +; X64-NEXT: pushq %rax +; X64-NEXT: movl %edi, %edx +; X64-NEXT: movl %esi, %ecx +; X64-NEXT: callq tailcallee1 +; X64-NEXT: retq $8 +; +; X32-LABEL: tailcaller1: +; X32: # %bb.0: # %entry +; X32-NEXT: pushl %edx +; X32-NEXT: pushl %ecx +; X32-NEXT: calll tailcallee1 +; X32-NEXT: retl +entry: + %tmp11 = tail call tailcc i32 @tailcallee1(i32 %in1, i32 %in2, i32 %in1, i32 %in2) + ret i32 %tmp11 +} + +declare fastcc i32 @tailcallee2(i32 %a1, i32 %a2, i32 %a3, i32 %a4) + +define tailcc i32 @tailcaller2(i32 %in1, i32 %in2) nounwind { +; X64-LABEL: tailcaller2: +; X64: # %bb.0: # %entry +; X64-NEXT: pushq %rax +; X64-NEXT: movl %edi, %edx +; X64-NEXT: movl %esi, %ecx +; X64-NEXT: callq tailcallee2 +; X64-NEXT: retq $8 +; +; X32-LABEL: tailcaller2: +; X32: # %bb.0: # %entry +; X32-NEXT: pushl %edx +; X32-NEXT: pushl %ecx +; X32-NEXT: calll tailcallee2 +; X32-NEXT: retl +entry: + %tmp11 = tail call fastcc i32 @tailcallee2(i32 %in1, i32 %in2, i32 %in1, i32 %in2) + ret i32 %tmp11 +} Index: llvm/test/CodeGen/X86/tailcc-fastisel.ll =================================================================== --- /dev/null +++ llvm/test/CodeGen/X86/tailcc-fastisel.ll @@ -0,0 +1,18 @@ +; RUN: llc < %s -mtriple=x86_64-apple-darwin -fast-isel -fast-isel-abort=1 | FileCheck %s + +%0 = type { i64, i32, i8* } + +define tailcc i8* @"visit_array_aux<`Reference>"(%0 %arg, i32 %arg1) nounwind { +fail: ; preds = %entry + %tmp20 = tail call tailcc i8* @"visit_array_aux<`Reference>"(%0 %arg, i32 undef) ; [#uses=1] +; CHECK: jmp "_visit_array_aux<`Reference>" ## TAILCALL + ret i8* %tmp20 +} + +define i32 @foo() nounwind { +entry: + %0 = tail call i32 (...) @bar() nounwind ; [#uses=1] + ret i32 %0 +} + +declare i32 @bar(...) 
nounwind Index: llvm/test/CodeGen/X86/tailcc-largecode.ll =================================================================== --- /dev/null +++ llvm/test/CodeGen/X86/tailcc-largecode.ll @@ -0,0 +1,71 @@ +; RUN: llc < %s -mtriple=x86_64-linux-gnu -code-model=large -enable-misched=false | FileCheck %s + +declare tailcc i32 @callee(i32 %arg) +define tailcc i32 @directcall(i32 %arg) { +entry: +; This is the large code model, so &callee may not fit into the jmp +; instruction. Instead, stick it into a register. +; CHECK: movabsq $callee, [[REGISTER:%r[a-z0-9]+]] +; CHECK: jmpq *[[REGISTER]] # TAILCALL + %res = tail call tailcc i32 @callee(i32 %arg) + ret i32 %res +} + +; Check that the register used for an indirect tail call doesn't +; clobber any of the arguments. +define tailcc i32 @indirect_manyargs(i32(i32,i32,i32,i32,i32,i32,i32)* %target) { +; Adjust the stack to enter the function. (The amount of the +; adjustment may change in the future, in which case the location of +; the stack argument and the return adjustment will change too.) +; CHECK: pushq +; Put the call target into R11, which won't be clobbered while restoring +; callee-saved registers and won't be used for passing arguments. +; CHECK: movq %rdi, %rax +; Pass the stack argument. +; CHECK: movl $7, 16(%rsp) +; Pass the register arguments, in the right registers. +; CHECK: movl $1, %edi +; CHECK: movl $2, %esi +; CHECK: movl $3, %edx +; CHECK: movl $4, %ecx +; CHECK: movl $5, %r8d +; CHECK: movl $6, %r9d +; Adjust the stack to "return". +; CHECK: popq +; And tail-call to the target. +; CHECK: jmpq *%rax # TAILCALL + %res = tail call tailcc i32 %target(i32 1, i32 2, i32 3, i32 4, i32 5, + i32 6, i32 7) + ret i32 %res +} + +; Check that the register used for a direct tail call doesn't clobber +; any of the arguments. +declare tailcc i32 @manyargs_callee(i32,i32,i32,i32,i32,i32,i32) +define tailcc i32 @direct_manyargs() { +; Adjust the stack to enter the function. (The amount of the +; adjustment may change in the future, in which case the location of +; the stack argument and the return adjustment will change too.) +; CHECK: pushq +; Pass the stack argument. +; CHECK: movl $7, 16(%rsp) +; This is the large code model, so &manyargs_callee may not fit into +; the jmp instruction. Put it into a register which won't be clobbered +; while restoring callee-saved registers and won't be used for passing +; arguments. +; CHECK: movabsq $manyargs_callee, %rax +; Pass the register arguments, in the right registers. +; CHECK: movl $1, %edi +; CHECK: movl $2, %esi +; CHECK: movl $3, %edx +; CHECK: movl $4, %ecx +; CHECK: movl $5, %r8d +; CHECK: movl $6, %r9d +; Adjust the stack to "return". +; CHECK: popq +; And tail-call to the target. 
+; CHECK: jmpq *%rax # TAILCALL + %res = tail call tailcc i32 @manyargs_callee(i32 1, i32 2, i32 3, i32 4, + i32 5, i32 6, i32 7) + ret i32 %res +} Index: llvm/test/CodeGen/X86/tailcc-stackalign.ll =================================================================== --- /dev/null +++ llvm/test/CodeGen/X86/tailcc-stackalign.ll @@ -0,0 +1,23 @@ +; RUN: llc < %s -mtriple=i686-unknown-linux -no-x86-call-frame-opt | FileCheck %s +; Linux has 8 byte alignment so the params cause stack size 20, +; ensure that a normal tailcc call has matching stack size + + +define tailcc i32 @tailcallee(i32 %a1, i32 %a2, i32 %a3, i32 %a4) { + ret i32 %a3 +} + +define tailcc i32 @tailcaller(i32 %in1, i32 %in2, i32 %in3, i32 %in4) { + %tmp11 = tail call tailcc i32 @tailcallee(i32 %in1, i32 %in2, + i32 %in1, i32 %in2) + ret i32 %tmp11 +} + +define i32 @main(i32 %argc, i8** %argv) { + %tmp1 = call tailcc i32 @tailcaller( i32 1, i32 2, i32 3, i32 4 ) + ; expect match subl [stacksize] here + ret i32 0 +} + +; CHECK: calll tailcaller +; CHECK-NEXT: subl $12 Index: llvm/test/CodeGen/X86/tailcc-structret.ll =================================================================== --- /dev/null +++ llvm/test/CodeGen/X86/tailcc-structret.ll @@ -0,0 +1,7 @@ +; RUN: llc < %s -mtriple=i686-unknown-linux | FileCheck %s +define tailcc { { i8*, i8* }*, i8*} @init({ { i8*, i8* }*, i8*}, i32) { +entry: + %2 = tail call tailcc { { i8*, i8* }*, i8* } @init({ { i8*, i8*}*, i8*} %0, i32 %1) + ret { { i8*, i8* }*, i8*} %2 +; CHECK: jmp init +} Index: llvm/test/CodeGen/X86/tailccbyval.ll =================================================================== --- /dev/null +++ llvm/test/CodeGen/X86/tailccbyval.ll @@ -0,0 +1,21 @@ +; RUN: llc < %s -mtriple=i686-unknown-linux | FileCheck %s +%struct.s = type {i32, i32, i32, i32, i32, i32, i32, i32, + i32, i32, i32, i32, i32, i32, i32, i32, + i32, i32, i32, i32, i32, i32, i32, i32 } + +define tailcc i32 @tailcallee(%struct.s* byval %a) nounwind { +entry: + %tmp2 = getelementptr %struct.s, %struct.s* %a, i32 0, i32 0 + %tmp3 = load i32, i32* %tmp2 + ret i32 %tmp3 +; CHECK: tailcallee +; CHECK: movl 4(%esp), %eax +} + +define tailcc i32 @tailcaller(%struct.s* byval %a) nounwind { +entry: + %tmp4 = tail call tailcc i32 @tailcallee(%struct.s* byval %a ) + ret i32 %tmp4 +; CHECK: tailcaller +; CHECK: jmp tailcallee +} Index: llvm/test/CodeGen/X86/tailccbyval64.ll =================================================================== --- /dev/null +++ llvm/test/CodeGen/X86/tailccbyval64.ll @@ -0,0 +1,42 @@ +; RUN: llc < %s -mcpu=generic -mtriple=x86_64-linux | FileCheck %s + +; FIXME: Win64 does not support byval. + +; Expect the entry point. +; CHECK-LABEL: tailcaller: + +; Expect 2 rep;movs because of tail call byval lowering. +; CHECK: rep; +; CHECK: rep; + +; A sequence of copyto/copyfrom virtual registers is used to deal with byval +; lowering appearing after moving arguments to registers. The following two +; checks verify that the register allocator changes those sequences to direct +; moves to argument register where it can (for registers that are not used in +; byval lowering - not rsi, not rdi, not rcx). +; Expect argument 4 to be moved directly to register edx. +; CHECK: movl $7, %edx + +; Expect argument 6 to be moved directly to register r8. +; CHECK: movl $17, %r8d + +; Expect not call but jmp to @tailcallee. +; CHECK: jmp tailcallee + +; Expect the trailer. 
+; CHECK: .size tailcaller + +%struct.s = type { i64, i64, i64, i64, i64, i64, i64, i64, + i64, i64, i64, i64, i64, i64, i64, i64, + i64, i64, i64, i64, i64, i64, i64, i64 } + +declare tailcc i64 @tailcallee(%struct.s* byval %a, i64 %val, i64 %val2, i64 %val3, i64 %val4, i64 %val5) + + +define tailcc i64 @tailcaller(i64 %b, %struct.s* byval %a) { +entry: + %tmp2 = getelementptr %struct.s, %struct.s* %a, i32 0, i32 1 + %tmp3 = load i64, i64* %tmp2, align 8 + %tmp4 = tail call tailcc i64 @tailcallee(%struct.s* byval %a , i64 %tmp3, i64 %b, i64 7, i64 13, i64 17) + ret i64 %tmp4 +} Index: llvm/test/CodeGen/X86/tailccfp.ll =================================================================== --- /dev/null +++ llvm/test/CodeGen/X86/tailccfp.ll @@ -0,0 +1,6 @@ +; RUN: llc < %s -mtriple=i686-- | FileCheck %s +define tailcc i32 @bar(i32 %X, i32(double, i32) *%FP) { + %Y = tail call tailcc i32 %FP(double 0.0, i32 %X) + ret i32 %Y +; CHECK: jmpl +} Index: llvm/test/CodeGen/X86/tailccfp2.ll =================================================================== --- /dev/null +++ llvm/test/CodeGen/X86/tailccfp2.ll @@ -0,0 +1,27 @@ +; RUN: llc < %s -mtriple=i686-- | FileCheck %s + +declare i32 @putchar(i32) + +define tailcc i32 @checktail(i32 %x, i32* %f, i32 %g) nounwind { +; CHECK-LABEL: checktail: + %tmp1 = icmp sgt i32 %x, 0 + br i1 %tmp1, label %if-then, label %if-else + +if-then: + %fun_ptr = bitcast i32* %f to i32(i32, i32*, i32)* + %arg1 = add i32 %x, -1 + call i32 @putchar(i32 90) +; CHECK: jmpl *%e{{.*}} + %res = tail call tailcc i32 %fun_ptr( i32 %arg1, i32 * %f, i32 %g) + ret i32 %res + +if-else: + ret i32 %x +} + + +define i32 @main() nounwind { + %f = bitcast i32 (i32, i32*, i32)* @checktail to i32* + %res = tail call tailcc i32 @checktail( i32 10, i32* %f,i32 10) + ret i32 %res +} Index: llvm/test/CodeGen/X86/tailccpic1.ll =================================================================== --- /dev/null +++ llvm/test/CodeGen/X86/tailccpic1.ll @@ -0,0 +1,16 @@ +; RUN: llc < %s -mtriple=i686-pc-linux-gnu -relocation-model=pic | FileCheck %s + +; This test uses guaranteed TCO so these will be tail calls, despite the early +; binding issues. + +define protected tailcc i32 @tailcallee(i32 %a1, i32 %a2, i32 %a3, i32 %a4) { +entry: + ret i32 %a3 +} + +define tailcc i32 @tailcaller(i32 %in1, i32 %in2) { +entry: + %tmp11 = tail call tailcc i32 @tailcallee( i32 %in1, i32 %in2, i32 %in1, i32 %in2 ) ; [#uses=1] + ret i32 %tmp11 +; CHECK: jmp tailcallee +} Index: llvm/test/CodeGen/X86/tailccpic2.ll =================================================================== --- /dev/null +++ llvm/test/CodeGen/X86/tailccpic2.ll @@ -0,0 +1,15 @@ +; RUN: llc < %s -mtriple=i686-pc-linux-gnu -relocation-model=pic | FileCheck %s + +define tailcc i32 @tailcallee(i32 %a1, i32 %a2, i32 %a3, i32 %a4) { +entry: + ret i32 %a3 +} + +define tailcc i32 @tailcaller(i32 %in1, i32 %in2) { +entry: + %tmp11 = tail call tailcc i32 @tailcallee( i32 %in1, i32 %in2, i32 %in1, i32 %in2 ) ; [#uses=1] + ret i32 %tmp11 +; CHECK: movl tailcallee@GOT +; CHECK: jmpl +} + Index: llvm/test/CodeGen/X86/tailccstack64.ll =================================================================== --- /dev/null +++ llvm/test/CodeGen/X86/tailccstack64.ll @@ -0,0 +1,28 @@ +; RUN: llc < %s -mcpu=generic -mtriple=x86_64-linux -post-RA-scheduler=true | FileCheck %s +; RUN: llc < %s -mcpu=generic -mtriple=x86_64-win32 -post-RA-scheduler=true | FileCheck %s + +; FIXME: Redundant unused stack allocation could be eliminated. 
+; CHECK: subq ${{24|72|80}}, %rsp + +; Check that lowered arguments on the stack do not overwrite each other. +; Add %in1 %p1 to a different temporary register (%eax). +; CHECK: movl [[A1:32|144]](%rsp), [[R1:%e..]] +; Move param %in1 to temp register (%r10d). +; CHECK: movl [[A2:40|152]](%rsp), [[R2:%[a-z0-9]+]] +; Add %in1 %p1 to a different temporary register (%eax). +; CHECK: addl {{%edi|%ecx}}, [[R1]] +; Move param %in2 to stack. +; CHECK-DAG: movl [[R2]], [[A1]](%rsp) +; Move result of addition to stack. +; CHECK-DAG: movl [[R1]], [[A2]](%rsp) +; Eventually, do a TAILCALL +; CHECK: TAILCALL + +declare tailcc i32 @tailcallee(i32 %p1, i32 %p2, i32 %p3, i32 %p4, i32 %p5, i32 %p6, i32 %a, i32 %b) nounwind + +define tailcc i32 @tailcaller(i32 %p1, i32 %p2, i32 %p3, i32 %p4, i32 %p5, i32 %p6, i32 %in1, i32 %in2) nounwind { +entry: + %tmp = add i32 %in1, %p1 + %retval = tail call tailcc i32 @tailcallee(i32 %p1, i32 %p2, i32 %p3, i32 %p4, i32 %p5, i32 %p6, i32 %in2,i32 %tmp) + ret i32 %retval +} Index: llvm/utils/vim/syntax/llvm.vim =================================================================== --- llvm/utils/vim/syntax/llvm.vim +++ llvm/utils/vim/syntax/llvm.vim @@ -82,6 +82,7 @@ \ externally_initialized \ extern_weak \ fastcc + \ tailcc \ filter \ from \ gc
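A minimal usage sketch, not part of the patch above: the function names and file name are hypothetical, and it assumes the patch has been applied so the parser accepts the tailcc keyword. Both calls below are in tail position with tailcc on caller and callee, so per the LangRef and CodeGenerator.rst changes above they are expected to lower to jumps (e.g. "jmp callee # TAILCALL" on x86-64) even without -tailcallopt, mirroring the checks in the new tailcall-tailcc.ll and musttail-tailcc.ll tests.

; tailcc-example.ll (hypothetical): guaranteed tail calls without -tailcallopt.
declare tailcc i32 @callee(i32, i32)

; 'tail' marker + tailcc on caller and callee + call in tail position
; (ret immediately uses the call's value) => guaranteed tail call under
; the rules added to LangRef above.
define tailcc i32 @caller(i32 %a, i32 %b) nounwind {
entry:
  %r = tail call tailcc i32 @callee(i32 %b, i32 %a)
  ret i32 %r
}

; 'musttail' behaves as before; note tailcc does not support varargs and
; requires the call-site prototype to exactly match the callee's, which
; holds here.
define tailcc i32 @must_caller(i32 %a, i32 %b) nounwind {
entry:
  %r = musttail call tailcc i32 @callee(i32 %a, i32 %b)
  ret i32 %r
}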