Index: llvm/trunk/include/llvm/CodeGen/MachineFrameInfo.h
===================================================================
--- llvm/trunk/include/llvm/CodeGen/MachineFrameInfo.h
+++ llvm/trunk/include/llvm/CodeGen/MachineFrameInfo.h
@@ -239,6 +239,9 @@
   /// True if the function contains a call to the llvm.vastart intrinsic.
   bool HasVAStart;
 
+  /// True if this is a varargs function that contains a musttail call.
+  bool HasMustTailInVarArgFunc;
+
   const TargetFrameLowering *getFrameLowering() const;
 public:
   explicit MachineFrameInfo(const TargetMachine &TM, bool RealignOpt)
@@ -260,6 +263,7 @@
     UseLocalStackAllocationBlock = false;
     HasInlineAsmWithSPAdjust = false;
     HasVAStart = false;
+    HasMustTailInVarArgFunc = false;
   }
 
   /// hasStackObjects - Return true if there are any stack objects in this
@@ -483,6 +487,10 @@
   bool hasVAStart() const { return HasVAStart; }
   void setHasVAStart(bool B) { HasVAStart = B; }
 
+  /// Returns true if the function is variadic and contains a musttail call.
+  bool hasMustTailInVarArgFunc() const { return HasMustTailInVarArgFunc; }
+  void setHasMustTailInVarArgFunc(bool B) { HasMustTailInVarArgFunc = B; }
+
   /// getMaxCallFrameSize - Return the maximum size of a call frame that must be
   /// allocated for an outgoing function call. This is only available if
   /// CallFrameSetup/Destroy pseudo instructions are used by the target, and
Index: llvm/trunk/lib/CodeGen/SelectionDAG/FunctionLoweringInfo.cpp
===================================================================
--- llvm/trunk/lib/CodeGen/SelectionDAG/FunctionLoweringInfo.cpp
+++ llvm/trunk/lib/CodeGen/SelectionDAG/FunctionLoweringInfo.cpp
@@ -146,6 +146,13 @@
           MF->getFrameInfo()->setHasVAStart(true);
       }
 
+      // If we have a musttail call in a variadic function, we need to ensure we
+      // forward implicit register parameters.
+      if (auto *CI = dyn_cast<CallInst>(I)) {
+        if (CI->isMustTailCall() && Fn->isVarArg())
+          MF->getFrameInfo()->setHasMustTailInVarArgFunc(true);
+      }
+
       // Mark values used outside their block as exported, by allocating
       // a virtual register for them.
       if (isUsedOutsideOfDefiningBlock(I))
Index: llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
===================================================================
--- llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
+++ llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
@@ -2326,6 +2326,52 @@
   }
 }
 
+static ArrayRef<MCPhysReg> get64BitArgumentGPRs(CallingConv::ID CallConv,
+                                                const X86Subtarget *Subtarget) {
+  assert(Subtarget->is64Bit());
+
+  if (Subtarget->isCallingConvWin64(CallConv)) {
+    static const MCPhysReg GPR64ArgRegsWin64[] = {
+      X86::RCX, X86::RDX, X86::R8, X86::R9
+    };
+    return GPR64ArgRegsWin64;
+  }
+
+  static const MCPhysReg GPR64ArgRegs64Bit[] = {
+    X86::RDI, X86::RSI, X86::RDX, X86::RCX, X86::R8, X86::R9
+  };
+  return GPR64ArgRegs64Bit;
+}
+
+static ArrayRef<MCPhysReg> get64BitArgumentXMMs(MachineFunction &MF,
+                                                CallingConv::ID CallConv,
+                                                const X86Subtarget *Subtarget) {
+  assert(Subtarget->is64Bit());
+  if (Subtarget->isCallingConvWin64(CallConv)) {
+    // The XMM registers which might contain var arg parameters are shadowed
+    // in their paired GPR. So we only need to save the GPR to their home
+    // slots.
+    return None;
+  }
+
+  const Function *Fn = MF.getFunction();
+  bool NoImplicitFloatOps = Fn->getAttributes().
+    hasAttribute(AttributeSet::FunctionIndex, Attribute::NoImplicitFloat);
+  assert(!(MF.getTarget().Options.UseSoftFloat && NoImplicitFloatOps) &&
+         "SSE register cannot be used when SSE is disabled!");
+  if (MF.getTarget().Options.UseSoftFloat || NoImplicitFloatOps ||
+      !Subtarget->hasSSE1())
+    // Kernel mode asks for SSE to be disabled, so there are no XMM argument
+    // registers.
+    return None;
+
+  static const MCPhysReg XMMArgRegs64Bit[] = {
+    X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3,
+    X86::XMM4, X86::XMM5, X86::XMM6, X86::XMM7
+  };
+  return XMMArgRegs64Bit;
+}
+
 SDValue
 X86TargetLowering::LowerFormalArguments(SDValue Chain,
                                         CallingConv::ID CallConv,
@@ -2469,57 +2515,49 @@
   // If the function takes variable number of arguments, make a frame index for
   // the start of the first vararg value... for expansion of llvm.va_start. We
   // can skip this if there are no va_start calls.
-  if (isVarArg && MFI->hasVAStart()) {
-    if (Is64Bit || (CallConv != CallingConv::X86_FastCall &&
-                    CallConv != CallingConv::X86_ThisCall)) {
-      FuncInfo->setVarArgsFrameIndex(MFI->CreateFixedObject(1, StackSize,true));
-    }
-    if (Is64Bit) {
-      unsigned TotalNumIntRegs = 0, TotalNumXMMRegs = 0;
-
-      // FIXME: We should really autogenerate these arrays
-      static const MCPhysReg GPR64ArgRegsWin64[] = {
-        X86::RCX, X86::RDX, X86::R8, X86::R9
-      };
-      static const MCPhysReg GPR64ArgRegs64Bit[] = {
-        X86::RDI, X86::RSI, X86::RDX, X86::RCX, X86::R8, X86::R9
-      };
-      static const MCPhysReg XMMArgRegs64Bit[] = {
-        X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3,
-        X86::XMM4, X86::XMM5, X86::XMM6, X86::XMM7
-      };
-      const MCPhysReg *GPR64ArgRegs;
-      unsigned NumXMMRegs = 0;
-
-      if (IsWin64) {
-        // The XMM registers which might contain var arg parameters are shadowed
-        // in their paired GPR. So we only need to save the GPR to their home
-        // slots.
-        TotalNumIntRegs = 4;
-        GPR64ArgRegs = GPR64ArgRegsWin64;
-      } else {
-        TotalNumIntRegs = 6; TotalNumXMMRegs = 8;
-        GPR64ArgRegs = GPR64ArgRegs64Bit;
-
-        NumXMMRegs = CCInfo.getFirstUnallocated(XMMArgRegs64Bit,
-                                                TotalNumXMMRegs);
+  if (MFI->hasVAStart() &&
+      (Is64Bit || (CallConv != CallingConv::X86_FastCall &&
+                   CallConv != CallingConv::X86_ThisCall))) {
+    FuncInfo->setVarArgsFrameIndex(
+        MFI->CreateFixedObject(1, StackSize, true));
+  }
+
+  // 64-bit calling conventions support varargs and register parameters, so we
+  // have to do extra work to spill them in the prologue or forward them to
+  // musttail calls.
+  if (Is64Bit && isVarArg &&
+      (MFI->hasVAStart() || MFI->hasMustTailInVarArgFunc())) {
+    // Find the first unallocated argument registers.
+    ArrayRef<MCPhysReg> ArgGPRs = get64BitArgumentGPRs(CallConv, Subtarget);
+    ArrayRef<MCPhysReg> ArgXMMs = get64BitArgumentXMMs(MF, CallConv, Subtarget);
+    unsigned NumIntRegs =
+        CCInfo.getFirstUnallocated(ArgGPRs.data(), ArgGPRs.size());
+    unsigned NumXMMRegs =
+        CCInfo.getFirstUnallocated(ArgXMMs.data(), ArgXMMs.size());
+    assert(!(NumXMMRegs && !Subtarget->hasSSE1()) &&
+           "SSE register cannot be used when SSE is disabled!");
+
+    // Gather all the live in physical registers.
+    SmallVector<SDValue, 6> LiveGPRs;
+    SmallVector<SDValue, 8> LiveXMMRegs;
+    SDValue ALVal;
+    for (MCPhysReg Reg : ArgGPRs.slice(NumIntRegs)) {
+      unsigned GPR = MF.addLiveIn(Reg, &X86::GR64RegClass);
+      LiveGPRs.push_back(
+          DAG.getCopyFromReg(DAG.getEntryNode(), dl, GPR, MVT::i64));
+    }
+    if (!ArgXMMs.empty()) {
+      unsigned AL = MF.addLiveIn(X86::AL, &X86::GR8RegClass);
+      ALVal = DAG.getCopyFromReg(DAG.getEntryNode(), dl, AL, MVT::i8);
+      for (MCPhysReg Reg : ArgXMMs.slice(NumXMMRegs)) {
+        unsigned XMMReg = MF.addLiveIn(Reg, &X86::VR128RegClass);
+        LiveXMMRegs.push_back(
+            DAG.getCopyFromReg(Chain, dl, XMMReg, MVT::v4f32));
       }
-      unsigned NumIntRegs = CCInfo.getFirstUnallocated(GPR64ArgRegs,
-                                                       TotalNumIntRegs);
-
-      bool NoImplicitFloatOps = Fn->getAttributes().
-        hasAttribute(AttributeSet::FunctionIndex, Attribute::NoImplicitFloat);
-      assert(!(NumXMMRegs && !Subtarget->hasSSE1()) &&
-             "SSE register cannot be used when SSE is disabled!");
-      assert(!(NumXMMRegs && MF.getTarget().Options.UseSoftFloat &&
-               NoImplicitFloatOps) &&
-             "SSE register cannot be used when SSE is disabled!");
-      if (MF.getTarget().Options.UseSoftFloat || NoImplicitFloatOps ||
-          !Subtarget->hasSSE1())
-        // Kernel mode asks for SSE to be disabled, so don't push them
-        // on the stack.
-        TotalNumXMMRegs = 0;
+    }
 
+    // Store them to the va_list returned by va_start.
+    if (MFI->hasVAStart()) {
       if (IsWin64) {
         const TargetFrameLowering &TFI = *MF.getSubtarget().getFrameLowering();
         // Get to the caller-allocated home save location. Add 8 to account
@@ -2535,10 +2573,9 @@
         // registers, then we must store them to their spots on the stack so
         // they may be loaded by deferencing the result of va_next.
         FuncInfo->setVarArgsGPOffset(NumIntRegs * 8);
-        FuncInfo->setVarArgsFPOffset(TotalNumIntRegs * 8 + NumXMMRegs * 16);
-        FuncInfo->setRegSaveFrameIndex(
-          MFI->CreateStackObject(TotalNumIntRegs * 8 + TotalNumXMMRegs * 16, 16,
-                                 false));
+        FuncInfo->setVarArgsFPOffset(ArgGPRs.size() * 8 + NumXMMRegs * 16);
+        FuncInfo->setRegSaveFrameIndex(MFI->CreateStackObject(
+            ArgGPRs.size() * 8 + ArgXMMs.size() * 16, 16, false));
       }
 
       // Store the integer parameter registers.
@@ -2546,12 +2583,9 @@
       SDValue RSFIN = DAG.getFrameIndex(FuncInfo->getRegSaveFrameIndex(),
                                         getPointerTy());
       unsigned Offset = FuncInfo->getVarArgsGPOffset();
-      for (; NumIntRegs != TotalNumIntRegs; ++NumIntRegs) {
+      for (SDValue Val : LiveGPRs) {
        SDValue FIN = DAG.getNode(ISD::ADD, dl, getPointerTy(), RSFIN,
                                  DAG.getIntPtrConstant(Offset));
-        unsigned VReg = MF.addLiveIn(GPR64ArgRegs[NumIntRegs],
-                                     &X86::GR64RegClass);
-        SDValue Val = DAG.getCopyFromReg(Chain, dl, VReg, MVT::i64);
        SDValue Store =
          DAG.getStore(Val.getValue(1), dl, Val, FIN,
                       MachinePointerInfo::getFixedStack(
@@ -2561,32 +2595,52 @@
        Offset += 8;
      }
 
-      if (TotalNumXMMRegs != 0 && NumXMMRegs != TotalNumXMMRegs) {
+      if (!ArgXMMs.empty() && NumXMMRegs != ArgXMMs.size()) {
        // Now store the XMM (fp + vector) parameter registers.
        SmallVector<SDValue, 12> SaveXMMOps;
        SaveXMMOps.push_back(Chain);
-
-        unsigned AL = MF.addLiveIn(X86::AL, &X86::GR8RegClass);
-        SDValue ALVal = DAG.getCopyFromReg(DAG.getEntryNode(), dl, AL, MVT::i8);
        SaveXMMOps.push_back(ALVal);
-
        SaveXMMOps.push_back(DAG.getIntPtrConstant(
                               FuncInfo->getRegSaveFrameIndex()));
        SaveXMMOps.push_back(DAG.getIntPtrConstant(
                               FuncInfo->getVarArgsFPOffset()));
-
-        for (; NumXMMRegs != TotalNumXMMRegs; ++NumXMMRegs) {
-          unsigned VReg = MF.addLiveIn(XMMArgRegs64Bit[NumXMMRegs],
-                                       &X86::VR128RegClass);
-          SDValue Val = DAG.getCopyFromReg(Chain, dl, VReg, MVT::v4f32);
-          SaveXMMOps.push_back(Val);
-        }
+        SaveXMMOps.insert(SaveXMMOps.end(), LiveXMMRegs.begin(),
+                          LiveXMMRegs.end());
        MemOps.push_back(DAG.getNode(X86ISD::VASTART_SAVE_XMM_REGS, dl,
                                     MVT::Other, SaveXMMOps));
      }
 
      if (!MemOps.empty())
        Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOps);
+    } else {
+      // TODO: Save virtual registers away somewhere so we can do
+      // getCopyFromReg in the musttail call lowering bb.
+      assert(MFI->hasMustTailInVarArgFunc());
+      auto &Forwards = FuncInfo->getForwardedMustTailRegParms();
+      typedef X86MachineFunctionInfo::Forward Forward;
+
+      // Add all GPRs, al, and XMMs to the list of forwards.
+      for (unsigned I = 0, E = LiveGPRs.size(); I != E; ++I) {
+        unsigned VReg =
+            MF.getRegInfo().createVirtualRegister(&X86::GR64RegClass);
+        Chain = DAG.getCopyToReg(Chain, dl, VReg, LiveGPRs[I]);
+        Forwards.push_back(Forward(VReg, ArgGPRs[NumIntRegs + I], MVT::i64));
+      }
+
+      if (!ArgXMMs.empty()) {
+        unsigned ALVReg =
+            MF.getRegInfo().createVirtualRegister(&X86::GR8RegClass);
+        Chain = DAG.getCopyToReg(Chain, dl, ALVReg, ALVal);
+        Forwards.push_back(Forward(ALVReg, X86::AL, MVT::i8));
+
+        for (unsigned I = 0, E = LiveXMMRegs.size(); I != E; ++I) {
+          unsigned VReg =
+              MF.getRegInfo().createVirtualRegister(&X86::VR128RegClass);
+          Chain = DAG.getCopyToReg(Chain, dl, VReg, LiveXMMRegs[I]);
+          Forwards.push_back(
+              Forward(VReg, ArgXMMs[NumXMMRegs + I], MVT::v4f32));
+        }
+      }
    }
  }
 
@@ -2689,6 +2743,7 @@
   bool IsWin64 = Subtarget->isCallingConvWin64(CallConv);
   StructReturnType SR = callIsStructReturn(Outs);
   bool IsSibcall = false;
+  X86MachineFunctionInfo *X86Info = MF.getInfo<X86MachineFunctionInfo>();
 
   if (MF.getTarget().Options.DisableTailCalls)
     isTailCall = false;
@@ -2741,7 +2796,6 @@
   int FPDiff = 0;
   if (isTailCall && !IsSibcall && !IsMustTail) {
     // Lower arguments at fp - stackoffset + fpdiff.
-    X86MachineFunctionInfo *X86Info = MF.getInfo<X86MachineFunctionInfo>();
     unsigned NumBytesCallerPushed = X86Info->getBytesToPopOnReturn();
 
     FPDiff = NumBytesCallerPushed - NumBytes;
@@ -2884,7 +2938,7 @@
     }
   }
 
-  if (Is64Bit && isVarArg && !IsWin64) {
+  if (Is64Bit && isVarArg && !IsWin64 && !IsMustTail) {
     // From AMD64 ABI document:
     // For calls that may call functions that use varargs or stdargs
     // (prototype-less calls or calls to functions containing ellipsis (...) in
@@ -2906,6 +2960,14 @@
                                         DAG.getConstant(NumXMMRegs, MVT::i8)));
   }
 
+  if (Is64Bit && isVarArg && IsMustTail) {
+    const auto &Forwards = X86Info->getForwardedMustTailRegParms();
+    for (const auto &F : Forwards) {
+      SDValue Val = DAG.getCopyFromReg(Chain, dl, F.VReg, F.VT);
+      RegsToPass.push_back(std::make_pair(unsigned(F.PReg), Val));
+    }
+  }
+
   // For tail calls lower the arguments to the 'real' stack slots. Sibcalls
   // don't need this because the eligibility check rejects calls that require
   // shuffling arguments passed in memory.
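The LowerFormalArguments/LowerCall changes above are easiest to read against the IR shape they target: a variadic thunk that never calls llvm.va_start and simply forwards its entire variable-argument pack through a musttail call. The sketch below restates that shape up front (it mirrors the f_thunk test added at the end of this patch); the @select_impl and @forwarding_thunk names are illustrative only and appear nowhere in the patch.

  declare void(i8*, ...)* @select_impl(i8* %this)

  define void @forwarding_thunk(i8* %this, ...) {
    %impl = call void(i8*, ...)*(i8*)* @select_impl(i8* %this)
    musttail call void (i8*, ...)* %impl(i8* %this, ...)
    ret void
  }

With no va_start in the function, LowerFormalArguments takes the new else branch: each still-unallocated argument GPR and XMM register (plus AL, which carries the vector-register count under the SysV convention) is copied into a virtual register and recorded as a Forward entry, and LowerCall copies those values back into the same physical registers immediately before the tail jump. On the va_start path the same registers are instead spilled to the register save area, whose size on the SysV target works out to ArgGPRs.size() * 8 + ArgXMMs.size() * 16 = 6 * 8 + 8 * 16 = 176 bytes, matching the register save area described by the AMD64 psABI.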
Index: llvm/trunk/lib/Target/X86/X86MachineFunctionInfo.h
===================================================================
--- llvm/trunk/lib/Target/X86/X86MachineFunctionInfo.h
+++ llvm/trunk/lib/Target/X86/X86MachineFunctionInfo.h
@@ -15,6 +15,8 @@
 #define LLVM_LIB_TARGET_X86_X86MACHINEFUNCTIONINFO_H
 
 #include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineValueType.h"
+#include <vector>
 
 namespace llvm {
 
@@ -70,6 +72,22 @@
   unsigned NumLocalDynamics;
 
 public:
+  /// Describes a register that needs to be forwarded from the prologue to a
+  /// musttail call.
+  struct Forward {
+    Forward(unsigned VReg, MCPhysReg PReg, MVT VT)
+        : VReg(VReg), PReg(PReg), VT(VT) {}
+    unsigned VReg;
+    MCPhysReg PReg;
+    MVT VT;
+  };
+
+private:
+  /// ForwardedMustTailRegParms - A list of virtual and physical registers
+  /// that must be forwarded to every musttail call.
+  std::vector<Forward> ForwardedMustTailRegParms;
+
+public:
   X86MachineFunctionInfo() : ForceFramePointer(false),
                              CalleeSavedFrameSize(0),
                              BytesToPopOnReturn(0),
@@ -138,6 +156,9 @@
   unsigned getNumLocalDynamicTLSAccesses() const { return NumLocalDynamics; }
   void incNumLocalDynamicTLSAccesses() { ++NumLocalDynamics; }
 
+  std::vector<Forward> &getForwardedMustTailRegParms() {
+    return ForwardedMustTailRegParms;
+  }
 };
 
 } // End llvm namespace
Index: llvm/trunk/test/CodeGen/X86/musttail-varargs.ll
===================================================================
--- llvm/trunk/test/CodeGen/X86/musttail-varargs.ll
+++ llvm/trunk/test/CodeGen/X86/musttail-varargs.ll
@@ -0,0 +1,119 @@
+; RUN: llc < %s -enable-tail-merge=0 -mtriple=x86_64-linux | FileCheck %s --check-prefix=LINUX
+; RUN: llc < %s -enable-tail-merge=0 -mtriple=x86_64-windows | FileCheck %s --check-prefix=WINDOWS
+
+; Test that we actually spill and reload all arguments in the variadic argument
+; pack. Doing a normal call will clobber all argument registers, and we will
+; spill around it. A simple adjustment should not require any XMM spills.
+
+declare void(i8*, ...)* @get_f(i8* %this)
+
+define void @f_thunk(i8* %this, ...) {
+  %fptr = call void(i8*, ...)*(i8*)* @get_f(i8* %this)
+  musttail call void (i8*, ...)* %fptr(i8* %this, ...)
+  ret void
+}
+
+; Save and restore 6 GPRs, 8 XMMs, and AL around the call.
+
+; LINUX-LABEL: f_thunk:
+; LINUX-DAG: movq %rdi, {{.*}}
+; LINUX-DAG: movq %rsi, {{.*}}
+; LINUX-DAG: movq %rdx, {{.*}}
+; LINUX-DAG: movq %rcx, {{.*}}
+; LINUX-DAG: movq %r8, {{.*}}
+; LINUX-DAG: movq %r9, {{.*}}
+; LINUX-DAG: movb %al, {{.*}}
+; LINUX-DAG: movaps %xmm0, {{[0-9]*}}(%rsp)
+; LINUX-DAG: movaps %xmm1, {{[0-9]*}}(%rsp)
+; LINUX-DAG: movaps %xmm2, {{[0-9]*}}(%rsp)
+; LINUX-DAG: movaps %xmm3, {{[0-9]*}}(%rsp)
+; LINUX-DAG: movaps %xmm4, {{[0-9]*}}(%rsp)
+; LINUX-DAG: movaps %xmm5, {{[0-9]*}}(%rsp)
+; LINUX-DAG: movaps %xmm6, {{[0-9]*}}(%rsp)
+; LINUX-DAG: movaps %xmm7, {{[0-9]*}}(%rsp)
+; LINUX: callq get_f
+; LINUX-DAG: movaps {{[0-9]*}}(%rsp), %xmm0
+; LINUX-DAG: movaps {{[0-9]*}}(%rsp), %xmm1
+; LINUX-DAG: movaps {{[0-9]*}}(%rsp), %xmm2
+; LINUX-DAG: movaps {{[0-9]*}}(%rsp), %xmm3
+; LINUX-DAG: movaps {{[0-9]*}}(%rsp), %xmm4
+; LINUX-DAG: movaps {{[0-9]*}}(%rsp), %xmm5
+; LINUX-DAG: movaps {{[0-9]*}}(%rsp), %xmm6
+; LINUX-DAG: movaps {{[0-9]*}}(%rsp), %xmm7
+; LINUX-DAG: movq {{.*}}, %rdi
+; LINUX-DAG: movq {{.*}}, %rsi
+; LINUX-DAG: movq {{.*}}, %rdx
+; LINUX-DAG: movq {{.*}}, %rcx
+; LINUX-DAG: movq {{.*}}, %r8
+; LINUX-DAG: movq {{.*}}, %r9
+; LINUX-DAG: movb {{.*}}, %al
+; LINUX: jmpq *{{.*}} # TAILCALL
+
+; WINDOWS-LABEL: f_thunk:
+; WINDOWS-NOT: mov{{.}}ps
+; WINDOWS-DAG: movq %rdx, {{.*}}
+; WINDOWS-DAG: movq %rcx, {{.*}}
+; WINDOWS-DAG: movq %r8, {{.*}}
+; WINDOWS-DAG: movq %r9, {{.*}}
+; WINDOWS-NOT: mov{{.}}ps
+; WINDOWS: callq get_f
+; WINDOWS-NOT: mov{{.}}ps
+; WINDOWS-DAG: movq {{.*}}, %rdx
+; WINDOWS-DAG: movq {{.*}}, %rcx
+; WINDOWS-DAG: movq {{.*}}, %r8
+; WINDOWS-DAG: movq {{.*}}, %r9
+; WINDOWS-NOT: mov{{.}}ps
+; WINDOWS: jmpq *{{.*}} # TAILCALL
+
+; This thunk shouldn't require any spills and reloads, assuming the register
+; allocator knows what it's doing.
+
+define void @g_thunk(i8* %fptr_i8, ...) {
+  %fptr = bitcast i8* %fptr_i8 to void (i8*, ...)*
+  musttail call void (i8*, ...)* %fptr(i8* %fptr_i8, ...)
+  ret void
+}
+
+; LINUX-LABEL: g_thunk:
+; LINUX-NOT: movq
+; LINUX: jmpq *%rdi # TAILCALL
+
+; WINDOWS-LABEL: g_thunk:
+; WINDOWS-NOT: movq
+; WINDOWS: jmpq *%rcx # TAILCALL
+
+; Do a simple multi-exit multi-bb test.
+
+%struct.Foo = type { i1, i8*, i8* }
+
+@g = external global i32
+
+define void @h_thunk(%struct.Foo* %this, ...) {
+  %cond_p = getelementptr %struct.Foo* %this, i32 0, i32 0
+  %cond = load i1* %cond_p
+  br i1 %cond, label %then, label %else
+
+then:
+  %a_p = getelementptr %struct.Foo* %this, i32 0, i32 1
+  %a_i8 = load i8** %a_p
+  %a = bitcast i8* %a_i8 to void (%struct.Foo*, ...)*
+  musttail call void (%struct.Foo*, ...)* %a(%struct.Foo* %this, ...)
+  ret void
+
+else:
+  %b_p = getelementptr %struct.Foo* %this, i32 0, i32 2
+  %b_i8 = load i8** %b_p
+  %b = bitcast i8* %b_i8 to void (%struct.Foo*, ...)*
+  store i32 42, i32* @g
+  musttail call void (%struct.Foo*, ...)* %b(%struct.Foo* %this, ...)
+  ret void
+}
+
+; LINUX-LABEL: h_thunk:
+; LINUX: jne
+; LINUX: jmpq *{{.*}} # TAILCALL
+; LINUX: jmpq *{{.*}} # TAILCALL
+; WINDOWS-LABEL: h_thunk:
+; WINDOWS: jne
+; WINDOWS: jmpq *{{.*}} # TAILCALL
+; WINDOWS: jmpq *{{.*}} # TAILCALL
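For contrast, a minimal sketch that is not part of the patch (the @select_impl and @plain_thunk names are hypothetical): a musttail thunk in a non-variadic function never sets the new hasMustTailInVarArgFunc() flag, since FunctionLoweringInfo only sets it when the calling function is varargs, and it needs none of the Forward machinery because every register it passes to the callee is already one of its own named arguments.

  declare void (i8*)* @select_impl(i8* %this)

  define void @plain_thunk(i8* %this) {
    %impl = call void (i8*)*(i8*)* @select_impl(i8* %this)
    musttail call void %impl(i8* %this)
    ret void
  }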