Index: lib/Target/ARM64/ARM64FrameLowering.cpp
===================================================================
--- lib/Target/ARM64/ARM64FrameLowering.cpp
+++ lib/Target/ARM64/ARM64FrameLowering.cpp
@@ -107,32 +107,38 @@
 void ARM64FrameLowering::eliminateCallFramePseudoInstr(
     MachineFunction &MF, MachineBasicBlock &MBB,
     MachineBasicBlock::iterator I) const {
-  const TargetFrameLowering *TFI = MF.getTarget().getFrameLowering();
   const ARM64InstrInfo *TII =
      static_cast<const ARM64InstrInfo *>(MF.getTarget().getInstrInfo());
+  DebugLoc DL = I->getDebugLoc();
+  int Opc = I->getOpcode();
+  bool IsDestroy = Opc == TII->getCallFrameDestroyOpcode();
+  uint64_t CalleePopAmount = IsDestroy ? I->getOperand(1).getImm() : 0;
+
+  const TargetFrameLowering *TFI = MF.getTarget().getFrameLowering();
   if (!TFI->hasReservedCallFrame(MF)) {
-    // If we have alloca, convert as follows:
-    // ADJCALLSTACKDOWN -> sub, sp, sp, amount
-    // ADJCALLSTACKUP   -> add, sp, sp, amount
-    MachineInstr *Old = I;
-    DebugLoc DL = Old->getDebugLoc();
-    unsigned Amount = Old->getOperand(0).getImm();
-    if (Amount != 0) {
-      // We need to keep the stack aligned properly.  To do this, we round the
-      // amount of space needed for the outgoing arguments up to the next
-      // alignment boundary.
-      unsigned Align = TFI->getStackAlignment();
-      Amount = (Amount + Align - 1) / Align * Align;
-
-      // Replace the pseudo instruction with a new instruction...
-      unsigned Opc = Old->getOpcode();
-      if (Opc == ARM64::ADJCALLSTACKDOWN) {
-        emitFrameOffset(MBB, I, DL, ARM64::SP, ARM64::SP, -Amount, TII);
-      } else {
-        assert(Opc == ARM64::ADJCALLSTACKUP && "expected ADJCALLSTACKUP");
-        emitFrameOffset(MBB, I, DL, ARM64::SP, ARM64::SP, Amount, TII);
-      }
+    unsigned Align = getStackAlignment();
+
+    int64_t Amount = I->getOperand(0).getImm();
+    Amount = RoundUpToAlignment(Amount, Align);
+    if (!IsDestroy)
+      Amount = -Amount;
+
+    // N.b. if CalleePopAmount is valid but zero (i.e. callee would pop, but it
+    // doesn't have to pop anything), then the first operand will be zero too
+    // so this adjustment is a no-op.
+    if (CalleePopAmount == 0) {
+      // FIXME: in-function stack adjustment for calls is limited to 12-bits
+      // because there's no guaranteed temporary register available. Mostly
+      // call frames will be allocated at the start of a function so this is
+      // OK, but it is a limitation that needs dealing with.
+      assert(Amount > -0xfff && Amount < 0xfff && "call frame too large");
+      emitFrameOffset(MBB, I, DL, ARM64::SP, ARM64::SP, Amount, TII);
    }
+  } else if (CalleePopAmount != 0) {
+    // If the calling convention demands that the callee pops arguments from
+    // the stack, we want to add it back if we have a reserved call frame.
+    assert(CalleePopAmount < 0xfff && "call frame too large");
+    emitFrameOffset(MBB, I, DL, ARM64::SP, ARM64::SP, -CalleePopAmount, TII);
   }
   MBB.erase(I);
 }
@@ -420,8 +426,57 @@
   const ARM64RegisterInfo *RegInfo =
      static_cast<const ARM64RegisterInfo *>(MF.getTarget().getRegisterInfo());
   DebugLoc DL = MBBI->getDebugLoc();
+  unsigned RetOpcode = MBBI->getOpcode();
 
   int NumBytes = MFI->getStackSize();
+  const ARM64FunctionInfo *AFI = MF.getInfo<ARM64FunctionInfo>();
+
+  // Initial and residual are named for consistency with the prologue. Note
+  // that in the epilogue, the residual adjustment is executed first.
+  uint64_t ArgumentPopSize = 0;
+  if (RetOpcode == ARM64::TCRETURNdi || RetOpcode == ARM64::TCRETURNri) {
+    MachineOperand &StackAdjust = MBBI->getOperand(1);
+
+    // For a tail-call in a callee-pops-arguments environment, some or all of
+    // the stack may actually be in use for the call's arguments; this is
+    // calculated during LowerCall and consumed here...
+    ArgumentPopSize = StackAdjust.getImm();
+  } else {
+    // ... otherwise the amount to pop is *all* of the argument space,
+    // conveniently stored in the MachineFunctionInfo by
+    // LowerFormalArguments. This will, of course, be zero for the C calling
+    // convention.
+    ArgumentPopSize = AFI->getArgumentStackToRestore();
+  }
+
+  // The stack frame should be like below,
+  //
+  //      ----------------------                     ---
+  //      |                    |                      |
+  //      | BytesInStackArgArea|              CalleeArgStackSize
+  //      | (NumReusableBytes) |                (of tail call)
+  //      |                    |                     ---
+  //      |                    |                      |
+  //      ---------------------|        ---           |
+  //      |                    |         |            |
+  //      |   CalleeSavedReg   |         |            |
+  //      | (NumRestores * 16) |         |            |
+  //      |                    |         |            |
+  //      ---------------------|         |         NumBytes
+  //      |                    |     StackSize  (StackAdjustUp)
+  //      |   LocalStackSize   |         |            |
+  //      | (covering callee   |         |            |
+  //      |       args)        |         |            |
+  //      |                    |         |            |
+  //      ----------------------        ---          ---
+  //
+  //      So NumBytes = StackSize + BytesInStackArgArea - CalleeArgStackSize
+  //                  = StackSize + ArgumentPopSize
+  //
+  //      ARM64TargetLowering::LowerCall figures out ArgumentPopSize and keeps
+  //      it as the 2nd argument of ARM64ISD::TC_RETURN.
+  NumBytes += ArgumentPopSize;
+
   unsigned NumRestores = 0;
   // Move past the restores of the callee-saved registers.
   MachineBasicBlock::iterator LastPopI = MBBI;
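
For reference, the epilogue arithmetic above reduces to a round-up-and-add; the standalone C++ sketch below walks through it with concrete numbers. The helper and the figures are illustrative only and are not part of the patch.

    #include <cassert>
    #include <cstdint>

    // Round Value up to the next multiple of Align (a power of two); this
    // mirrors what the RoundUpToAlignment() calls in the patch are assumed
    // to do.
    static uint64_t roundUpTo(uint64_t Value, uint64_t Align) {
      return (Value + Align - 1) & ~(Align - 1);
    }

    int main() {
      // Hypothetical frame: 32 bytes of locals plus callee-saves, and a
      // fastcc callee expected to pop 8 bytes of stack arguments (rounded
      // up to 16 at the call site).
      uint64_t StackSize = 32;
      uint64_t ArgumentPopSize = roundUpTo(8, 16); // 16
      uint64_t NumBytes = StackSize + ArgumentPopSize;
      assert(NumBytes == 48 && NumBytes % 16 == 0); // SP stays 16-byte aligned
      return 0;
    }
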
Index: lib/Target/ARM64/ARM64ISelLowering.h
===================================================================
--- lib/Target/ARM64/ARM64ISelLowering.h
+++ lib/Target/ARM64/ARM64ISelLowering.h
@@ -353,6 +353,16 @@
                       const SmallVectorImpl<SDValue> &OutVals,
                       const SmallVectorImpl<ISD::InputArg> &Ins,
                       SelectionDAG &DAG) const;
+  /// Finds the incoming stack arguments which overlap the given fixed stack
+  /// object and incorporates their load into the current chain. This prevents
+  /// an upcoming store from clobbering the stack argument before it's used.
+  SDValue addTokenForArgument(SDValue Chain, SelectionDAG &DAG,
+                              MachineFrameInfo *MFI, int ClobberedFI) const;
+
+  bool DoesCalleeRestoreStack(CallingConv::ID CallCC, bool TailCallOpt) const;
+
+  bool IsTailCallConvention(CallingConv::ID CallCC) const;
+
   void saveVarArgRegisters(CCState &CCInfo, SelectionDAG &DAG, SDLoc DL,
                            SDValue &Chain) const;
Index: lib/Target/ARM64/ARM64ISelLowering.cpp
===================================================================
--- lib/Target/ARM64/ARM64ISelLowering.cpp
+++ lib/Target/ARM64/ARM64ISelLowering.cpp
@@ -39,12 +39,6 @@
 STATISTIC(NumTailCalls, "Number of tail calls");
 STATISTIC(NumShiftInserts, "Number of vector shift inserts");
 
-// This option should go away when tail calls fully work.
-static cl::opt<bool>
-EnableARM64TailCalls("arm64-tail-calls", cl::Hidden,
-                     cl::desc("Generate ARM64 tail calls (TEMPORARY OPTION)."),
-                     cl::init(true));
-
 enum AlignMode {
   StrictAlign,
   NoStrictAlign
@@ -1811,6 +1805,27 @@
     AFI->setVarArgsStackIndex(MFI->CreateFixedObject(4, StackOffset, true));
   }
 
+  ARM64FunctionInfo *FuncInfo = MF.getInfo<ARM64FunctionInfo>();
+  unsigned StackArgSize = CCInfo.getNextStackOffset();
+  bool TailCallOpt = MF.getTarget().Options.GuaranteedTailCallOpt;
+  if (DoesCalleeRestoreStack(CallConv, TailCallOpt)) {
+    // This is a non-standard ABI so by fiat I say we're allowed to make full
+    // use of the stack area to be popped, which must be aligned to 16 bytes
+    // in any case:
+    StackArgSize = RoundUpToAlignment(StackArgSize, 16);
+
+    // If we're expected to restore the stack (e.g. fastcc) then we'll be
+    // adding a multiple of 16.
+    FuncInfo->setArgumentStackToRestore(StackArgSize);
+
+    // This realignment carries over to the available bytes below. Our own
+    // callers will guarantee the space is free by giving an aligned value to
+    // CALLSEQ_START.
+  }
+  // Even if we're not expected to free up the space, it's useful to know how
+  // much is there while considering tail calls (because we can reuse it).
+  FuncInfo->setBytesInStackArgArea(StackArgSize);
+
   return Chain;
 }
 
@@ -1942,57 +1957,147 @@
     const SmallVectorImpl<ISD::OutputArg> &Outs,
    const SmallVectorImpl<SDValue> &OutVals,
    const SmallVectorImpl<ISD::InputArg> &Ins, SelectionDAG &DAG) const {
-  // Look for obvious safe cases to perform tail call optimization that do not
-  // require ABI changes. This is what gcc calls sibcall.
-
-  // Do not sibcall optimize vararg calls unless the call site is not passing
-  // any arguments.
-  if (isVarArg && !Outs.empty())
+  // For CallingConv::C this function knows whether the ABI needs
+  // changing. That's not true for other conventions so they will have to opt
+  // in manually.
+  if (!IsTailCallConvention(CalleeCC) && CalleeCC != CallingConv::C)
     return false;
 
-  // Also avoid sibcall optimization if either caller or callee uses struct
-  // return semantics.
-  if (isCalleeStructRet || isCallerStructRet)
+  const MachineFunction &MF = DAG.getMachineFunction();
+  const Function *CallerF = MF.getFunction();
+  CallingConv::ID CallerCC = CallerF->getCallingConv();
+  bool CCMatch = CallerCC == CalleeCC;
+
+  // Byval parameters hand the function a pointer directly into the stack area
+  // we want to reuse during a tail call. Working around this *is* possible
+  // (see X86) but less efficient and uglier in LowerCall.
+  for (Function::const_arg_iterator i = CallerF->arg_begin(),
+                                    e = CallerF->arg_end();
+       i != e; ++i)
+    if (i->hasByValAttr())
+      return false;
+
+  if (getTargetMachine().Options.GuaranteedTailCallOpt) {
+    if (IsTailCallConvention(CalleeCC) && CCMatch)
+      return true;
     return false;
+  }
 
-  // Note that currently ARM64 "C" calling convention and "Fast" calling
-  // convention are compatible. If/when that ever changes, we'll need to
-  // add checks here to make sure any interactions are OK.
+  // Now we search for cases where we can use a tail call without changing the
+  // ABI. Sibcall is used in some places (particularly gcc) to refer to this
+  // concept.
 
-  // If the callee takes no arguments then go on to check the results of the
-  // call.
-  if (!Outs.empty()) {
-    // Check if stack adjustment is needed. For now, do not do this if any
-    // argument is passed on the stack.
+  // I want anyone implementing a new calling convention to think long and
+  // hard about this assert.
+  assert((!isVarArg || CalleeCC == CallingConv::C) &&
+         "Unexpected variadic calling convention");
+
+  if (isVarArg && !Outs.empty()) {
+    // At least two cases here: if caller is fastcc then we can't have any
+    // memory arguments (we'd be expected to clean up the stack afterwards).
+    // If caller is C then we could potentially use its argument area.
+
+    // FIXME: for now we take the most conservative of these in both cases:
+    // disallow all variadic memory operands.
    SmallVector<CCValAssign, 16> ArgLocs;
    CCState CCInfo(CalleeCC, isVarArg, DAG.getMachineFunction(),
                   getTargetMachine(), ArgLocs, *DAG.getContext());
-    CCAssignFn *AssignFn = CCAssignFnForCall(CalleeCC, /*IsVarArg=*/false);
-    CCInfo.AnalyzeCallOperands(Outs, AssignFn);
-    if (CCInfo.getNextStackOffset()) {
-      // Check if the arguments are already laid out in the right way as
-      // the caller's fixed stack objects.
-      for (unsigned i = 0, realArgIdx = 0, e = ArgLocs.size(); i != e;
-           ++i, ++realArgIdx) {
-        CCValAssign &VA = ArgLocs[i];
-        if (VA.getLocInfo() == CCValAssign::Indirect)
-          return false;
-        if (VA.needsCustom()) {
-          // Just don't handle anything that needs custom adjustments for now.
-          // If need be, we can revisit later, but we shouldn't ever end up
-          // here.
+    CCInfo.AnalyzeCallOperands(Outs, CCAssignFnForCall(CalleeCC, true));
+    for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i)
+      if (!ArgLocs[i].isRegLoc())
+        return false;
+  }
+
+  // If the calling conventions do not match, then we'd better make sure the
+  // results are returned in the same way as what the caller expects.
+  if (!CCMatch) {
+    SmallVector<CCValAssign, 16> RVLocs1;
+    CCState CCInfo1(CalleeCC, false, DAG.getMachineFunction(),
+                    getTargetMachine(), RVLocs1, *DAG.getContext());
+    CCInfo1.AnalyzeCallResult(Ins, CCAssignFnForCall(CalleeCC, isVarArg));
+
+    SmallVector<CCValAssign, 16> RVLocs2;
+    CCState CCInfo2(CallerCC, false, DAG.getMachineFunction(),
+                    getTargetMachine(), RVLocs2, *DAG.getContext());
+    CCInfo2.AnalyzeCallResult(Ins, CCAssignFnForCall(CallerCC, isVarArg));
+
+    if (RVLocs1.size() != RVLocs2.size())
+      return false;
+    for (unsigned i = 0, e = RVLocs1.size(); i != e; ++i) {
+      if (RVLocs1[i].isRegLoc() != RVLocs2[i].isRegLoc())
+        return false;
+      if (RVLocs1[i].getLocInfo() != RVLocs2[i].getLocInfo())
+        return false;
+      if (RVLocs1[i].isRegLoc()) {
+        if (RVLocs1[i].getLocReg() != RVLocs2[i].getLocReg())
          return false;
-        } else if (!VA.isRegLoc()) {
-          // Likewise, don't try to handle stack based arguments for the
-          // time being.
+      } else {
+        if (RVLocs1[i].getLocMemOffset() != RVLocs2[i].getLocMemOffset())
          return false;
-        }
       }
     }
   }
 
-  return true;
+  // Nothing more to check if the callee is taking no arguments.
+  if (Outs.empty())
+    return true;
+
+  SmallVector<CCValAssign, 16> ArgLocs;
+  CCState CCInfo(CalleeCC, isVarArg, DAG.getMachineFunction(),
+                 getTargetMachine(), ArgLocs, *DAG.getContext());
+
+  CCInfo.AnalyzeCallOperands(Outs, CCAssignFnForCall(CalleeCC, isVarArg));
+
+  const ARM64FunctionInfo *FuncInfo = MF.getInfo<ARM64FunctionInfo>();
+
+  // If the stack arguments for this call would fit into our own save area
+  // then the call can be made tail.
+  return CCInfo.getNextStackOffset() <= FuncInfo->getBytesInStackArgArea();
+}
+
+SDValue ARM64TargetLowering::addTokenForArgument(SDValue Chain,
+                                                 SelectionDAG &DAG,
+                                                 MachineFrameInfo *MFI,
+                                                 int ClobberedFI) const {
+  SmallVector<SDValue, 8> ArgChains;
+  int64_t FirstByte = MFI->getObjectOffset(ClobberedFI);
+  int64_t LastByte = FirstByte + MFI->getObjectSize(ClobberedFI) - 1;
+
+  // Include the original chain at the beginning of the list. When this is
+  // used by target LowerCall hooks, it helps the legalizer find the
+  // CALLSEQ_BEGIN node.
+  ArgChains.push_back(Chain);
+
+  // Add a chain value for each stack argument that overlaps the clobbered
+  // object.
+  for (SDNode::use_iterator U = DAG.getEntryNode().getNode()->use_begin(),
+                            UE = DAG.getEntryNode().getNode()->use_end();
+       U != UE; ++U)
+    if (LoadSDNode *L = dyn_cast<LoadSDNode>(*U))
+      if (FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(L->getBasePtr()))
+        if (FI->getIndex() < 0) {
+          int64_t InFirstByte = MFI->getObjectOffset(FI->getIndex());
+          int64_t InLastByte = InFirstByte;
+          InLastByte += MFI->getObjectSize(FI->getIndex()) - 1;
+
+          if ((InFirstByte <= FirstByte && FirstByte <= InLastByte) ||
+              (FirstByte <= InFirstByte && InFirstByte <= LastByte))
+            ArgChains.push_back(SDValue(L, 1));
+        }
+
+  // Build a tokenfactor for all the chains.
+  return DAG.getNode(ISD::TokenFactor, SDLoc(Chain), MVT::Other, ArgChains);
+}
+
+bool ARM64TargetLowering::DoesCalleeRestoreStack(CallingConv::ID CallCC,
+                                                 bool TailCallOpt) const {
+  return CallCC == CallingConv::Fast && TailCallOpt;
+}
+
+bool ARM64TargetLowering::IsTailCallConvention(CallingConv::ID CallCC) const {
+  return CallCC == CallingConv::Fast;
+}
+
 /// LowerCall - Lower a call to a callseq_start + CALL + callseq_end chain,
 /// and add input and output parameter nodes.
 SDValue ARM64TargetLowering::LowerCall(CallLoweringInfo &CLI,
@@ -2012,9 +2117,9 @@
   bool IsStructRet = (Outs.empty()) ? false : Outs[0].Flags.isSRet();
   bool IsThisReturn = false;
 
-  // If tail calls are explicitly disabled, make sure not to use them.
-  if (!EnableARM64TailCalls)
-    IsTailCall = false;
+  ARM64FunctionInfo *FuncInfo = MF.getInfo<ARM64FunctionInfo>();
+  bool TailCallOpt = MF.getTarget().Options.GuaranteedTailCallOpt;
+  bool IsSibCall = false;
 
   if (IsTailCall) {
     // Check if it's really possible to do a tail call.
@@ -2024,9 +2129,12 @@
     if (!IsTailCall && CLI.CS && CLI.CS->isMustTailCall())
       report_fatal_error("failed to perform tail call elimination on a call "
                          "site marked musttail");
-    // We don't support GuaranteedTailCallOpt, only automatically
-    // detected sibcalls.
-    // FIXME: Re-evaluate. Is this true? Should it be true?
+
+    // A sibling call is one where we're under the usual C ABI and not
+    // planning to change that but can still do a tail call:
+    if (!TailCallOpt && IsTailCall)
+      IsSibCall = true;
+
     if (IsTailCall)
       ++NumTailCalls;
   }
@@ -2082,9 +2190,42 @@
   // Get a count of how many bytes are to be pushed on the stack.
   unsigned NumBytes = CCInfo.getNextStackOffset();
 
+  if (IsSibCall) {
+    // Since we're not changing the ABI to make this a tail call, the memory
+    // operands are already available in the caller's incoming argument space.
+    NumBytes = 0;
+  }
+
+  // FPDiff is the byte offset of the call's argument area from the callee's.
+  // Stores to callee stack arguments will be placed in FixedStackSlots offset
+  // by this amount for a tail call. In a sibling call it must be 0 because
+  // the caller will deallocate the entire stack and the callee still expects
+  // its arguments to begin at SP+0. Completely unused for non-tail calls.
+  int FPDiff = 0;
+
+  if (IsTailCall && !IsSibCall) {
+    unsigned NumReusableBytes = FuncInfo->getBytesInStackArgArea();
+
+    // Since the callee will pop the argument stack as a tail call, we must
+    // keep the popped size 16-byte aligned.
+    NumBytes = RoundUpToAlignment(NumBytes, 16);
+
+    // FPDiff will be negative if this tail call requires more space than we
+    // would automatically have in our incoming argument space. Positive if we
+    // can actually shrink the stack.
+    FPDiff = NumReusableBytes - NumBytes;
+
+    // The stack pointer must be 16-byte aligned at all times it's used for a
+    // memory operation, which in practice means at *all* times and in
+    // particular across call boundaries. Therefore our own arguments started
+    // at a 16-byte aligned SP and the delta applied for the tail call should
+    // satisfy the same constraint.
+    assert(FPDiff % 16 == 0 && "unaligned stack on tail call");
+  }
+
   // Adjust the stack pointer for the new arguments...
   // These operations are automatically eliminated by the prolog/epilog pass
-  if (!IsTailCall)
+  if (!IsSibCall)
     Chain = DAG.getCALLSEQ_START(Chain, DAG.getIntPtrConstant(NumBytes, true),
                                  DL);
 
@@ -2134,31 +2275,50 @@
       RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg));
     } else {
       assert(VA.isMemLoc());
-      // There's no reason we can't support stack args w/ tailcall, but
-      // we currently don't, so assert if we see one.
-      assert(!IsTailCall && "stack argument with tail call!?");
 
-      // FIXME: This works on big-endian for composite byvals, which are the common
-      // case. It should also work for fundamental types too.
+      SDValue DstAddr;
+      MachinePointerInfo DstInfo;
+
+      // FIXME: This works on big-endian for composite byvals, which are the
+      // common case. It should also work for fundamental types too.
       uint32_t BEAlign = 0;
+      unsigned OpSize = Flags.isByVal() ? Flags.getByValSize() * 8
+                                        : VA.getLocVT().getSizeInBits();
+      OpSize = (OpSize + 7) / 8;
       if (!Subtarget->isLittleEndian() && !Flags.isByVal()) {
-        unsigned OpSize = (VA.getLocVT().getSizeInBits() + 7) / 8;
        if (OpSize < 8)
          BEAlign = 8 - OpSize;
       }
       unsigned LocMemOffset = VA.getLocMemOffset();
-      SDValue PtrOff = DAG.getIntPtrConstant(LocMemOffset + BEAlign);
+      int32_t Offset = LocMemOffset + BEAlign;
+      SDValue PtrOff = DAG.getIntPtrConstant(Offset);
       PtrOff = DAG.getNode(ISD::ADD, DL, getPointerTy(), StackPtr, PtrOff);
 
+      if (IsTailCall) {
+        Offset = Offset + FPDiff;
+        int FI = MF.getFrameInfo()->CreateFixedObject(OpSize, Offset, true);
+
+        DstAddr = DAG.getFrameIndex(FI, getPointerTy());
+        DstInfo = MachinePointerInfo::getFixedStack(FI);
+
+        // Make sure any stack arguments overlapping with where we're storing
+        // are loaded before this eventual operation. Otherwise they'll be
+        // clobbered.
+        Chain = addTokenForArgument(Chain, DAG, MF.getFrameInfo(), FI);
+      } else {
+        SDValue PtrOff = DAG.getIntPtrConstant(Offset);
+
+        DstAddr = DAG.getNode(ISD::ADD, DL, getPointerTy(), StackPtr, PtrOff);
+        DstInfo = MachinePointerInfo::getStack(LocMemOffset);
+      }
+
       if (Outs[i].Flags.isByVal()) {
        SDValue SizeNode =
            DAG.getConstant(Outs[i].Flags.getByValSize(), MVT::i64);
        SDValue Cpy = DAG.getMemcpy(
-            Chain, DL, PtrOff, Arg, SizeNode, Outs[i].Flags.getByValAlign(),
+            Chain, DL, DstAddr, Arg, SizeNode, Outs[i].Flags.getByValAlign(),
            /*isVolatile = */ false,
-            /*alwaysInline = */ false,
-            MachinePointerInfo::getStack(LocMemOffset), MachinePointerInfo());
+            /*alwaysInline = */ false, DstInfo, MachinePointerInfo());
 
        MemOpChains.push_back(Cpy);
       } else {
@@ -2171,9 +2331,8 @@
                            VA.getLocVT() == MVT::i16))
          Arg = DAG.getNode(ISD::TRUNCATE, DL, VA.getLocVT(), Arg);
 
-        SDValue Store = DAG.getStore(Chain, DL, Arg, PtrOff,
-                                     MachinePointerInfo::getStack(LocMemOffset),
-                                     false, false, 0);
+        SDValue Store =
+            DAG.getStore(Chain, DL, Arg, DstAddr, DstInfo, false, false, 0);
        MemOpChains.push_back(Store);
       }
     }
@@ -2221,10 +2380,27 @@
     Callee = DAG.getTargetExternalSymbol(Sym, getPointerTy(), 0);
   }
 
+  // We don't usually want to end the call-sequence here because we would tidy
+  // the frame up *after* the call; however, in the ABI-changing tail-call
+  // case we've carefully laid out the parameters so that when SP is reset
+  // they'll be in the correct location.
+  if (IsTailCall && !IsSibCall) {
+    Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(NumBytes, true),
+                               DAG.getIntPtrConstant(0, true), InFlag, DL);
+    InFlag = Chain.getValue(1);
+  }
+
   std::vector<SDValue> Ops;
   Ops.push_back(Chain);
   Ops.push_back(Callee);
 
+  if (IsTailCall) {
+    // Each tail call may have to adjust the stack by a different amount, so
+    // this information must travel along with the operation for eventual
+    // consumption by emitEpilogue.
+    Ops.push_back(DAG.getTargetConstant(FPDiff, MVT::i32));
+  }
+
   // Add argument registers to the end of the list so that they are known live
   // into the call.
   for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i)
@@ -2262,8 +2438,13 @@
   Chain = DAG.getNode(ARM64ISD::CALL, DL, NodeTys, Ops);
   InFlag = Chain.getValue(1);
 
+  uint64_t CalleePopBytes = DoesCalleeRestoreStack(CallConv, TailCallOpt)
+                                ? RoundUpToAlignment(NumBytes, 16)
+                                : 0;
+
   Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(NumBytes, true),
-                             DAG.getIntPtrConstant(0, true), InFlag, DL);
+                             DAG.getIntPtrConstant(CalleePopBytes, true),
+                             InFlag, DL);
   if (!Ins.empty())
     InFlag = Chain.getValue(1);
 
@@ -7441,9 +7622,6 @@
 // return instructions to help enable tail call optimizations for this
 // instruction.
 bool ARM64TargetLowering::mayBeEmittedAsTailCall(CallInst *CI) const {
-  if (!EnableARM64TailCalls)
-    return false;
-
   if (!CI->isTailCall())
     return false;
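
The FPDiff value threaded through LowerCall above is the one subtle piece of the ABI-changing tail-call path: it says how far the outgoing argument area sits from the caller's own incoming one. A minimal self-contained sketch of the same arithmetic, with illustrative names and values that are not taken from the patch:

    #include <cassert>
    #include <cstdint>

    // Offset of the outgoing argument area from the caller's incoming one for
    // a guaranteed tail call. Negative means the call needs more stack than
    // the caller's own argument area provides.
    static int64_t computeFPDiff(uint64_t BytesInStackArgArea,
                                 uint64_t OutgoingArgBytes) {
      uint64_t NumBytes = (OutgoingArgBytes + 15) & ~uint64_t(15);
      int64_t FPDiff = int64_t(BytesInStackArgArea) - int64_t(NumBytes);
      assert(FPDiff % 16 == 0 && "unaligned stack on tail call");
      return FPDiff;
    }

    int main() {
      assert(computeFPDiff(16, 8) == 0);   // caller's own slots can be reused
      assert(computeFPDiff(0, 24) == -32); // needs 32 bytes the caller lacks
      return 0;
    }
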
Index: lib/Target/ARM64/ARM64InstrInfo.td
===================================================================
--- lib/Target/ARM64/ARM64InstrInfo.td
+++ lib/Target/ARM64/ARM64InstrInfo.td
@@ -88,7 +88,7 @@
 def SDT_ARM64trivec : SDTypeProfile<1, 3, [SDTCisVec<0>, SDTCisSameAs<0,1>,
                                            SDTCisSameAs<0,2>, SDTCisSameAs<0,3>]>;
-def SDT_ARM64TCRET : SDTypeProfile<0, 1, [SDTCisPtrTy<0>]>;
+def SDT_ARM64TCRET : SDTypeProfile<0, 2, [SDTCisPtrTy<0>]>;
 def SDT_ARM64PREFETCH : SDTypeProfile<0, 2, [SDTCisVT<0, i32>, SDTCisPtrTy<1>]>;
 
 def SDT_ARM64ITOF : SDTypeProfile<1, 1, [SDTCisFP<0>, SDTCisSameAs<0,1>]>;
@@ -5163,12 +5163,15 @@
 // Tail call return handling. These are all compiler pseudo-instructions,
 // so no encoding information or anything like that.
 let isCall = 1, isTerminator = 1, isReturn = 1, isBarrier = 1, Uses = [SP] in {
-  def TCRETURNdi : Pseudo<(outs), (ins i64imm:$dst), []>;
-  def TCRETURNri : Pseudo<(outs), (ins tcGPR64:$dst), []>;
+  def TCRETURNdi : Pseudo<(outs), (ins i64imm:$dst, i32imm:$FPDiff), []>;
+  def TCRETURNri : Pseudo<(outs), (ins tcGPR64:$dst, i32imm:$FPDiff), []>;
 }
 
-def : Pat<(ARM64tcret tcGPR64:$dst), (TCRETURNri tcGPR64:$dst)>;
-def : Pat<(ARM64tcret (i64 tglobaladdr:$dst)), (TCRETURNdi texternalsym:$dst)>;
-def : Pat<(ARM64tcret (i64 texternalsym:$dst)), (TCRETURNdi texternalsym:$dst)>;
+def : Pat<(ARM64tcret tcGPR64:$dst, (i32 timm:$FPDiff)),
+          (TCRETURNri tcGPR64:$dst, imm:$FPDiff)>;
+def : Pat<(ARM64tcret tglobaladdr:$dst, (i32 timm:$FPDiff)),
+          (TCRETURNdi texternalsym:$dst, imm:$FPDiff)>;
+def : Pat<(ARM64tcret texternalsym:$dst, (i32 timm:$FPDiff)),
+          (TCRETURNdi texternalsym:$dst, imm:$FPDiff)>;
 
 include "ARM64InstrAtomics.td"
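
One more note on addTokenForArgument() in the ARM64ISelLowering.cpp hunk above: the condition it uses is a plain closed-interval overlap test between the clobbered fixed object and each incoming stack argument. A small hedged sketch of just that predicate (the function name and test values are illustrative):

    #include <cassert>
    #include <cstdint>

    // True if byte ranges [AFirst, ALast] and [BFirst, BLast] share at least
    // one byte; this matches the condition used in addTokenForArgument().
    static bool bytesOverlap(int64_t AFirst, int64_t ALast,
                             int64_t BFirst, int64_t BLast) {
      return (BFirst <= AFirst && AFirst <= BLast) ||
             (AFirst <= BFirst && BFirst <= ALast);
    }

    int main() {
      assert(bytesOverlap(0, 7, 4, 11));  // partial overlap
      assert(!bytesOverlap(0, 7, 8, 15)); // adjacent but disjoint
      assert(bytesOverlap(0, 31, 8, 15)); // containment
      return 0;
    }
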
Index: lib/Target/ARM64/ARM64MachineFunctionInfo.h
===================================================================
--- lib/Target/ARM64/ARM64MachineFunctionInfo.h
+++ lib/Target/ARM64/ARM64MachineFunctionInfo.h
@@ -25,6 +25,22 @@
 /// contains private ARM64-specific information for each MachineFunction.
 class ARM64FunctionInfo : public MachineFunctionInfo {
+  /// Number of bytes of arguments this function has on the stack. If the
+  /// callee is expected to restore the argument stack this should be a
+  /// multiple of 16, all usable during a tail call.
+  ///
+  /// The alternative would forbid tail call optimisation in some cases: if we
+  /// want to transfer control from a function with 8 bytes of stack-argument
+  /// space to a function with 16 bytes then misalignment of this value would
+  /// make a stack adjustment necessary, which could not be undone by the
+  /// callee.
+  unsigned BytesInStackArgArea;
+
+  /// The number of bytes to restore to deallocate space for incoming
+  /// arguments. Canonically 0 in the C calling convention, but non-zero when
+  /// the callee is expected to pop the args.
+  unsigned ArgumentStackToRestore;
+
   /// HasStackFrame - True if this function has a stack frame. Set by
   /// processFunctionBeforeCalleeSavedScan().
   bool HasStackFrame;
@@ -58,17 +74,25 @@
 public:
   ARM64FunctionInfo()
-      : HasStackFrame(false), NumLocalDynamicTLSAccesses(0),
-        VarArgsStackIndex(0), VarArgsGPRIndex(0), VarArgsGPRSize(0),
-        VarArgsFPRIndex(0), VarArgsFPRSize(0) {}
+      : BytesInStackArgArea(0), ArgumentStackToRestore(0), HasStackFrame(false),
+        NumLocalDynamicTLSAccesses(0), VarArgsStackIndex(0), VarArgsGPRIndex(0),
+        VarArgsGPRSize(0), VarArgsFPRIndex(0), VarArgsFPRSize(0) {}
 
   explicit ARM64FunctionInfo(MachineFunction &MF)
-      : HasStackFrame(false), NumLocalDynamicTLSAccesses(0),
-        VarArgsStackIndex(0), VarArgsGPRIndex(0), VarArgsGPRSize(0),
-        VarArgsFPRIndex(0), VarArgsFPRSize(0) {
+      : BytesInStackArgArea(0), ArgumentStackToRestore(0), HasStackFrame(false),
+        NumLocalDynamicTLSAccesses(0), VarArgsStackIndex(0), VarArgsGPRIndex(0),
+        VarArgsGPRSize(0), VarArgsFPRIndex(0), VarArgsFPRSize(0) {
     (void)MF;
   }
 
+  unsigned getBytesInStackArgArea() const { return BytesInStackArgArea; }
+  void setBytesInStackArgArea(unsigned bytes) { BytesInStackArgArea = bytes; }
+
+  unsigned getArgumentStackToRestore() const { return ArgumentStackToRestore; }
+  void setArgumentStackToRestore(unsigned bytes) {
+    ArgumentStackToRestore = bytes;
+  }
+
   bool hasStackFrame() const { return HasStackFrame; }
   void setHasStackFrame(bool s) { HasStackFrame = s; }
Index: test/CodeGen/AArch64/fastcc-reserved.ll
===================================================================
--- test/CodeGen/AArch64/fastcc-reserved.ll
+++ test/CodeGen/AArch64/fastcc-reserved.ll
@@ -1,4 +1,5 @@
 ; RUN: llc -verify-machineinstrs < %s -mtriple=aarch64-none-linux-gnu -tailcallopt | FileCheck %s
+; RUN: llc -verify-machineinstrs < %s -mtriple=arm64-none-linux-gnu -tailcallopt | FileCheck %s --check-prefix=CHECK-ARM64
 
 ; This test is designed to be run in the situation where the
 ; call-frame is not reserved (hence disable-fp-elim), but where
@@ -14,20 +15,28 @@
 ; Normal frame setup stuff:
 ; CHECK: sub sp, sp,
 ; CHECK: stp x29, x30
+; CHECK-ARM64: stp x29, x30, [sp, #-16]!
+; CHECK-ARM64: mov x29, sp
 
 ; Reserve space for call-frame:
 ; CHECK: sub sp, sp, #16
+; CHECK-ARM64: sub sp, sp, #16
 
   call fastcc void @will_pop([8 x i32] undef, i32 42)
 ; CHECK: bl will_pop
+; CHECK-ARM64: bl will_pop
 
 ; Since @will_pop is fastcc with tailcallopt, it will put the stack
 ; back where it needs to be, we shouldn't duplicate that
 ; CHECK-NOT: sub sp, sp, #16
 ; CHECK-NOT: add sp, sp,
+; CHECK-ARM64-NOT: sub sp, sp, #16
+; CHECK-ARM64-NOT: add sp, sp,
 
 ; CHECK: ldp x29, x30
 ; CHECK: add sp, sp,
+; CHECK-ARM64: mov sp, x29
+; CHECK-ARM64: ldp x29, x30, [sp], #16
   ret void
 }
 
@@ -40,19 +49,26 @@
 ; Normal frame setup again
 ; CHECK: sub sp, sp,
 ; CHECK: stp x29, x30
+; CHECK-ARM64: stp x29, x30, [sp, #-16]!
+; CHECK-ARM64: mov x29, sp
 
 ; Reserve space for call-frame
 ; CHECK: sub sp, sp, #16
+; CHECK-ARM64: sub sp, sp, #16
 
   call void @wont_pop([8 x i32] undef, i32 42)
 ; CHECK: bl wont_pop
+; CHECK-ARM64: bl wont_pop
 
 ; This time we *do* need to unreserve the call-frame
 ; CHECK: add sp, sp, #16
+; CHECK-ARM64: add sp, sp, #16
 
 ; Check for epilogue (primarily to make sure sp spotted above wasn't
 ; part of it).
 ; CHECK: ldp x29, x30
 ; CHECK: add sp, sp,
+; CHECK-ARM64: mov sp, x29
+; CHECK-ARM64: ldp x29, x30, [sp], #16
   ret void
 }
Index: test/CodeGen/AArch64/fastcc.ll
===================================================================
--- test/CodeGen/AArch64/fastcc.ll
+++ test/CodeGen/AArch64/fastcc.ll
@@ -1,5 +1,7 @@
 ; RUN: llc -verify-machineinstrs < %s -mtriple=aarch64-none-linux-gnu -tailcallopt | FileCheck %s -check-prefix CHECK-TAIL
+; RUN: llc -verify-machineinstrs < %s -mtriple=arm64-none-linux-gnu -tailcallopt | FileCheck %s -check-prefix CHECK-ARM64-TAIL
 ; RUN: llc -verify-machineinstrs < %s -mtriple=aarch64-none-linux-gnu | FileCheck %s
+; RUN: llc -verify-machineinstrs < %s -mtriple=arm64-none-linux-gnu | FileCheck --check-prefix=CHECK-ARM64 %s
 
 ; Without tailcallopt fastcc still means the caller cleans up the
 ; stack, so try to make sure this is respected.
@@ -8,116 +10,219 @@
 ; CHECK-LABEL: func_stack0:
 ; CHECK: sub sp, sp, #48
 
+; CHECK-ARM64-LABEL: func_stack0:
+; CHECK-ARM64: stp x29, x30, [sp, #-16]!
+; CHECK-ARM64-NEXT: mov x29, sp
+; CHECK-ARM64-NEXT: sub sp, sp, #32
+
 ; CHECK-TAIL-LABEL: func_stack0:
 ; CHECK-TAIL: sub sp, sp, #48
 
+; CHECK-ARM64-TAIL-LABEL: func_stack0:
+; CHECK-ARM64-TAIL: stp x29, x30, [sp, #-16]!
+; CHECK-ARM64-TAIL-NEXT: mov x29, sp
+; CHECK-ARM64-TAIL-NEXT: sub sp, sp, #32
+
   call fastcc void @func_stack8([8 x i32] undef, i32 42)
 ; CHECK: bl func_stack8
 ; CHECK-NOT: sub sp, sp,
 
+; CHECK-ARM64: bl func_stack8
+; CHECK-ARM64-NOT: sub sp, sp,
+
 ; CHECK-TAIL: bl func_stack8
 ; CHECK-TAIL: sub sp, sp, #16
 
+; CHECK-ARM64-TAIL: bl func_stack8
+; CHECK-ARM64-TAIL: sub sp, sp, #16
+
   call fastcc void @func_stack32([8 x i32] undef, i128 0, i128 9)
 ; CHECK: bl func_stack32
 ; CHECK-NOT: sub sp, sp,
 
+; CHECK-ARM64: bl func_stack32
+; CHECK-ARM64-NOT: sub sp, sp,
+
 ; CHECK-TAIL: bl func_stack32
 ; CHECK-TAIL: sub sp, sp, #32
 
+; CHECK-ARM64-TAIL: bl func_stack32
+; CHECK-ARM64-TAIL: sub sp, sp, #32
+
   call fastcc void @func_stack0()
 ; CHECK: bl func_stack0
 ; CHECK-NOT: sub sp, sp
 
+; CHECK-ARM64: bl func_stack0
+; CHECK-ARM64-NOT: sub sp, sp
+
 ; CHECK-TAIL: bl func_stack0
 ; CHECK-TAIL-NOT: sub sp, sp
 
+; CHECK-ARM64-TAIL: bl func_stack0
+; CHECK-ARM64-TAIL-NOT: sub sp, sp
+
   ret void
 ; CHECK: add sp, sp, #48
 ; CHECK-NEXT: ret
 
+; CHECK-ARM64: mov sp, x29
+; CHECK-ARM64-NEXT: ldp x29, x30, [sp], #16
+; CHECK-ARM64-NEXT: ret
+
 ; CHECK-TAIL: add sp, sp, #48
 ; CHECK-TAIL-NEXT: ret
+
+; CHECK-ARM64-TAIL: mov sp, x29
+; CHECK-ARM64-TAIL-NEXT: ldp x29, x30, [sp], #16
+; CHECK-ARM64-TAIL-NEXT: ret
 }
 
 define fastcc void @func_stack8([8 x i32], i32 %stacked) {
 ; CHECK-LABEL: func_stack8:
 ; CHECK: sub sp, sp, #48
 
+; CHECK-ARM64-LABEL: func_stack8:
+; CHECK-ARM64: stp x29, x30, [sp, #-16]!
+; CHECK-ARM64: mov x29, sp
+; CHECK-ARM64: sub sp, sp, #32
+
 ; CHECK-TAIL-LABEL: func_stack8:
 ; CHECK-TAIL: sub sp, sp, #48
 
+; CHECK-ARM64-TAIL-LABEL: func_stack8:
+; CHECK-ARM64-TAIL: stp x29, x30, [sp, #-16]!
+; CHECK-ARM64-TAIL: mov x29, sp
+; CHECK-ARM64-TAIL: sub sp, sp, #32
+
   call fastcc void @func_stack8([8 x i32] undef, i32 42)
 ; CHECK: bl func_stack8
 ; CHECK-NOT: sub sp, sp,
 
+; CHECK-ARM64: bl func_stack8
+; CHECK-ARM64-NOT: sub sp, sp,
+
 ; CHECK-TAIL: bl func_stack8
 ; CHECK-TAIL: sub sp, sp, #16
 
+; CHECK-ARM64-TAIL: bl func_stack8
+; CHECK-ARM64-TAIL: sub sp, sp, #16
+
   call fastcc void @func_stack32([8 x i32] undef, i128 0, i128 9)
 ; CHECK: bl func_stack32
 ; CHECK-NOT: sub sp, sp,
 
+; CHECK-ARM64: bl func_stack32
+; CHECK-ARM64-NOT: sub sp, sp,
+
 ; CHECK-TAIL: bl func_stack32
 ; CHECK-TAIL: sub sp, sp, #32
 
+; CHECK-ARM64-TAIL: bl func_stack32
+; CHECK-ARM64-TAIL: sub sp, sp, #32
+
   call fastcc void @func_stack0()
 ; CHECK: bl func_stack0
 ; CHECK-NOT: sub sp, sp
 
+; CHECK-ARM64: bl func_stack0
+; CHECK-ARM64-NOT: sub sp, sp
+
 ; CHECK-TAIL: bl func_stack0
 ; CHECK-TAIL-NOT: sub sp, sp
 
+; CHECK-ARM64-TAIL: bl func_stack0
+; CHECK-ARM64-TAIL-NOT: sub sp, sp
+
   ret void
 ; CHECK: add sp, sp, #48
 ; CHECK-NEXT: ret
 
+; CHECK-ARM64: mov sp, x29
+; CHECK-ARM64-NEXT: ldp x29, x30, [sp], #16
+; CHECK-ARM64-NEXT: ret
+
 ; CHECK-TAIL: add sp, sp, #64
 ; CHECK-TAIL-NEXT: ret
+
+; CHECK-ARM64-TAIL: mov sp, x29
+; CHECK-ARM64-TAIL-NEXT: ldp x29, x30, [sp], #16
+; CHECK-ARM64-TAIL-NEXT: ret
 }
 
 define fastcc void @func_stack32([8 x i32], i128 %stacked0, i128 %stacked1) {
 ; CHECK-LABEL: func_stack32:
 ; CHECK: sub sp, sp, #48
 
+; CHECK-ARM64-LABEL: func_stack32:
+; CHECK-ARM64: mov x29, sp
+
 ; CHECK-TAIL-LABEL: func_stack32:
 ; CHECK-TAIL: sub sp, sp, #48
 
+; CHECK-ARM64-TAIL-LABEL: func_stack32:
+; CHECK-ARM64-TAIL: mov x29, sp
+
   call fastcc void @func_stack8([8 x i32] undef, i32 42)
 ; CHECK: bl func_stack8
 ; CHECK-NOT: sub sp, sp,
 
+; CHECK-ARM64: bl func_stack8
+; CHECK-ARM64-NOT: sub sp, sp,
+
 ; CHECK-TAIL: bl func_stack8
 ; CHECK-TAIL: sub sp, sp, #16
 
+; CHECK-ARM64-TAIL: bl func_stack8
+; CHECK-ARM64-TAIL: sub sp, sp, #16
+
   call fastcc void @func_stack32([8 x i32] undef, i128 0, i128 9)
 ; CHECK: bl func_stack32
 ; CHECK-NOT: sub sp, sp,
 
+; CHECK-ARM64: bl func_stack32
+; CHECK-ARM64-NOT: sub sp, sp,
+
 ; CHECK-TAIL: bl func_stack32
 ; CHECK-TAIL: sub sp, sp, #32
 
+; CHECK-ARM64-TAIL: bl func_stack32
+; CHECK-ARM64-TAIL: sub sp, sp, #32
+
   call fastcc void @func_stack0()
 ; CHECK: bl func_stack0
 ; CHECK-NOT: sub sp, sp
 
+; CHECK-ARM64: bl func_stack0
+; CHECK-ARM64-NOT: sub sp, sp
+
 ; CHECK-TAIL: bl func_stack0
 ; CHECK-TAIL-NOT: sub sp, sp
 
+; CHECK-ARM64-TAIL: bl func_stack0
+; CHECK-ARM64-TAIL-NOT: sub sp, sp
+
   ret void
 ; CHECK: add sp, sp, #48
 ; CHECK-NEXT: ret
 
+; CHECK-ARM64: mov sp, x29
+; CHECK-ARM64-NEXT: ldp x29, x30, [sp], #16
+; CHECK-ARM64-NEXT: ret
+
 ; CHECK-TAIL: add sp, sp, #80
 ; CHECK-TAIL-NEXT: ret
+
+; CHECK-ARM64-TAIL: mov sp, x29
+; CHECK-ARM64-TAIL-NEXT: ldp x29, x30, [sp], #16
+; CHECK-ARM64-TAIL-NEXT: ret
 }
Index: test/CodeGen/AArch64/tail-call.ll
===================================================================
--- test/CodeGen/AArch64/tail-call.ll
+++ test/CodeGen/AArch64/tail-call.ll
@@ -1,4 +1,5 @@
 ; RUN: llc -verify-machineinstrs < %s -mtriple=aarch64-none-linux-gnu -tailcallopt | FileCheck %s
+; RUN: llc -verify-machineinstrs < %s -mtriple=arm64-none-linux-gnu -tailcallopt | FileCheck --check-prefix=CHECK-ARM64 %s
 
 declare fastcc void @callee_stack0()
 declare fastcc void @callee_stack8([8 x i32], i64)
@@ -7,57 +8,92 @@
 define fastcc void @caller_to0_from0() nounwind {
 ; CHECK-LABEL: caller_to0_from0:
 ; CHECK-NEXT: // BB
+
+; CHECK-ARM64-LABEL: caller_to0_from0:
+; CHECK-ARM64-NEXT: // BB
+
   tail call fastcc void @callee_stack0()
   ret void
+
 ; CHECK-NEXT: b callee_stack0
+
+; CHECK-ARM64-NEXT: b callee_stack0
 }
 
 define fastcc void @caller_to0_from8([8 x i32], i64) {
 ; CHECK-LABEL: caller_to0_from8:
 
+; CHECK-ARM64-LABEL: caller_to0_from8:
+
   tail call fastcc void @callee_stack0()
   ret void
+
 ; CHECK: add sp, sp, #16
 ; CHECK-NEXT: b callee_stack0
+
+; CHECK-ARM64: add sp, sp, #16
+; CHECK-ARM64-NEXT: b callee_stack0
 }
 
 define fastcc void @caller_to8_from0() {
 ; CHECK-LABEL: caller_to8_from0:
 ; CHECK: sub sp, sp, #32
 
+; CHECK-ARM64-LABEL: caller_to8_from0:
+; CHECK-ARM64: sub sp, sp, #32
+
 ; Key point is that the "42" should go #16 below incoming stack
 ; pointer (we didn't have arg space to reuse).
   tail call fastcc void @callee_stack8([8 x i32] undef, i64 42)
   ret void
+
 ; CHECK: str {{x[0-9]+}}, [sp, #16]
 ; CHECK-NEXT: add sp, sp, #16
 ; CHECK-NEXT: b callee_stack8
+
+; CHECK-ARM64: str {{x[0-9]+}}, [sp, #16]!
+; CHECK-ARM64-NEXT: b callee_stack8
 }
 
 define fastcc void @caller_to8_from8([8 x i32], i64 %a) {
 ; CHECK-LABEL: caller_to8_from8:
 ; CHECK: sub sp, sp, #16
 
+; CHECK-ARM64-LABEL: caller_to8_from8:
+; CHECK-ARM64: sub sp, sp, #16
 
+; Key point is that the "%a" should go exactly where SP points on entry.
   tail call fastcc void @callee_stack8([8 x i32] undef, i64 42)
   ret void
+
 ; CHECK: str {{x[0-9]+}}, [sp, #16]
 ; CHECK-NEXT: add sp, sp, #16
 ; CHECK-NEXT: b callee_stack8
+
+; CHECK-ARM64: str {{x[0-9]+}}, [sp, #16]!
+; CHECK-ARM64-NEXT: b callee_stack8
 }
 
 define fastcc void @caller_to16_from8([8 x i32], i64 %a) {
 ; CHECK-LABEL: caller_to16_from8:
 ; CHECK: sub sp, sp, #16
 
+; CHECK-ARM64-LABEL: caller_to16_from8:
+; CHECK-ARM64: sub sp, sp, #16
+
 ; Important point is that the call reuses the "dead" argument space
 ; above %a on the stack. If it tries to go below incoming-SP then the
 ; callee will not deallocate the space, even in fastcc.
   tail call fastcc void @callee_stack16([8 x i32] undef, i64 42, i64 2)
+
 ; CHECK: str {{x[0-9]+}}, [sp, #24]
 ; CHECK: str {{x[0-9]+}}, [sp, #16]
-; CHECK: add sp, sp, #16
-; CHECK: b callee_stack16
+; CHECK-NEXT: add sp, sp, #16
+; CHECK-NEXT: b callee_stack16
+
+; CHECK-ARM64: stp {{x[0-9]+}}, {{x[0-9]+}}, [sp, #16]
+; CHECK-ARM64-NEXT: add sp, sp, #16
+; CHECK-ARM64-NEXT: b callee_stack16
   ret void
 }
 
@@ -66,12 +102,19 @@
 ; CHECK-LABEL: caller_to8_from24:
 ; CHECK: sub sp, sp, #16
 
+; CHECK-ARM64-LABEL: caller_to8_from24:
+; CHECK-ARM64: sub sp, sp, #16
+
 ; Key point is that the "%a" should go exactly #16 above SP on entry.
   tail call fastcc void @callee_stack8([8 x i32] undef, i64 42)
   ret void
+
 ; CHECK: str {{x[0-9]+}}, [sp, #32]
 ; CHECK-NEXT: add sp, sp, #32
 ; CHECK-NEXT: b callee_stack8
+
+; CHECK-ARM64: str {{x[0-9]+}}, [sp, #32]!
+; CHECK-ARM64-NEXT: b callee_stack8
 }
 
 @@ -79,6 +122,9 @@
 ; CHECK-LABEL: caller_to16_from16:
 ; CHECK: sub sp, sp, #16
 
+; CHECK-ARM64-LABEL: caller_to16_from16:
+; CHECK-ARM64: sub sp, sp, #16
+
 ; Here we want to make sure that both loads happen before the stores:
 ; otherwise either %a or %b will be wrongly clobbered.
   tail call fastcc void @callee_stack16([8 x i32] undef, i64 %b, i64 %a)
@@ -89,6 +135,11 @@
 ; CHECK: str x1,
 ; CHECK: str x0,
-; CHECK: add sp, sp, #16
-; CHECK: b callee_stack16
+; CHECK-NEXT: add sp, sp, #16
+; CHECK-NEXT: b callee_stack16
+
+; CHECK-ARM64: ldp {{x[0-9]+}}, {{x[0-9]+}}, [sp, #16]
+; CHECK-ARM64: stp {{x[0-9]+}}, {{x[0-9]+}}, [sp, #16]
+; CHECK-ARM64-NEXT: add sp, sp, #16
+; CHECK-ARM64-NEXT: b callee_stack16
 }
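
Finally, the fastcc tests above all hinge on how many bytes the callee is expected to pop. Under this patch that amount is, roughly, the rounded-up outgoing argument size when (and only when) the convention is fastcc and -tailcallopt is in force; a hedged sketch of that rule, not the patch's own code:

    #include <cassert>
    #include <cstdint>

    // Bytes recorded on CALLSEQ_END for the callee to pop. Non-zero only for
    // fastcc with GuaranteedTailCallOpt (-tailcallopt), mirroring what
    // DoesCalleeRestoreStack() in the patch is used for.
    static uint64_t calleePopBytes(bool IsFastCC, bool TailCallOpt,
                                   uint64_t OutgoingArgBytes) {
      if (!(IsFastCC && TailCallOpt))
        return 0;
      return (OutgoingArgBytes + 15) & ~uint64_t(15);
    }

    int main() {
      // cf. the "sub sp, sp, #16" the -tailcallopt runs expect after
      // "bl func_stack8": the callee popped 16 bytes, so the caller must
      // re-reserve them.
      assert(calleePopBytes(true, true, 8) == 16);
      assert(calleePopBytes(true, false, 8) == 0); // caller cleans up instead
      assert(calleePopBytes(false, true, 8) == 0); // C convention never pops
      return 0;
    }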