Index: lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
===================================================================
--- lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
+++ lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
@@ -631,10 +631,12 @@
     }
 
     if (MI.isCall()) {
-      assert(MI.getOpcode() == AMDGPU::SI_CALL);
       // Pseudo used just to encode the underlying global. Is there a better
       // way to track this?
-      const Function *Callee = cast<Function>(MI.getOperand(2).getGlobal());
+
+      const MachineOperand *CalleeOp
+        = TII->getNamedOperand(MI, AMDGPU::OpName::callee);
+      const Function *Callee = cast<Function>(CalleeOp->getGlobal());
       if (Callee->isDeclaration()) {
         // If this is a call to an external function, we can't do much. Make
         // conservative guesses.
Index: lib/Target/AMDGPU/AMDGPUISelLowering.h
===================================================================
--- lib/Target/AMDGPU/AMDGPUISelLowering.h
+++ lib/Target/AMDGPU/AMDGPUISelLowering.h
@@ -172,6 +172,11 @@
                       const SmallVectorImpl<SDValue> &OutVals,
                       const SDLoc &DL, SelectionDAG &DAG) const override;
 
+  SDValue addTokenForArgument(SDValue Chain,
+                              SelectionDAG &DAG,
+                              MachineFrameInfo &MFI,
+                              int ClobberedFI) const;
+
   SDValue lowerUnhandledCall(CallLoweringInfo &CLI,
                              SmallVectorImpl<SDValue> &InVals,
                              StringRef Reason) const;
@@ -291,6 +296,7 @@
 
   // Function call.
   CALL,
+  TC_RETURN,
   TRAP,
 
   // Masked control flow nodes.
Index: lib/Target/AMDGPU/AMDGPUISelLowering.cpp
===================================================================
--- lib/Target/AMDGPU/AMDGPUISelLowering.cpp
+++ lib/Target/AMDGPU/AMDGPUISelLowering.cpp
@@ -1001,6 +1001,42 @@
   return AMDGPUCallLowering::CCAssignFnForReturn(CC, IsVarArg);
 }
 
+SDValue AMDGPUTargetLowering::addTokenForArgument(SDValue Chain,
+                                                  SelectionDAG &DAG,
+                                                  MachineFrameInfo &MFI,
+                                                  int ClobberedFI) const {
+  SmallVector<SDValue, 8> ArgChains;
+  int64_t FirstByte = MFI.getObjectOffset(ClobberedFI);
+  int64_t LastByte = FirstByte + MFI.getObjectSize(ClobberedFI) - 1;
+
+  // Include the original chain at the beginning of the list. When this is
+  // used by target LowerCall hooks, this helps legalize find the
+  // CALLSEQ_BEGIN node.
+  ArgChains.push_back(Chain);
+
+  // Add a chain value for each stack argument corresponding to this call.
+  for (SDNode::use_iterator U = DAG.getEntryNode().getNode()->use_begin(),
+                            UE = DAG.getEntryNode().getNode()->use_end();
+       U != UE; ++U) {
+    if (LoadSDNode *L = dyn_cast<LoadSDNode>(*U)) {
+      if (FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(L->getBasePtr())) {
+        if (FI->getIndex() < 0) {
+          int64_t InFirstByte = MFI.getObjectOffset(FI->getIndex());
+          int64_t InLastByte = InFirstByte;
+          InLastByte += MFI.getObjectSize(FI->getIndex()) - 1;
+
+          if ((InFirstByte <= FirstByte && FirstByte <= InLastByte) ||
+              (FirstByte <= InFirstByte && InFirstByte <= LastByte))
+            ArgChains.push_back(SDValue(L, 1));
+        }
+      }
+    }
+  }
+
+  // Build a tokenfactor for all the chains.
+  return DAG.getNode(ISD::TokenFactor, SDLoc(Chain), MVT::Other, ArgChains);
+}
+
 SDValue AMDGPUTargetLowering::lowerUnhandledCall(CallLoweringInfo &CLI,
                                                  SmallVectorImpl<SDValue> &InVals,
                                                  StringRef Reason) const {
@@ -3658,6 +3694,7 @@
   NODE_NAME_CASE(ELSE)
   NODE_NAME_CASE(LOOP)
   NODE_NAME_CASE(CALL)
+  NODE_NAME_CASE(TC_RETURN)
   NODE_NAME_CASE(TRAP)
   NODE_NAME_CASE(RET_FLAG)
   NODE_NAME_CASE(RETURN_TO_EPILOG)
Index: lib/Target/AMDGPU/AMDGPUInstrInfo.td
===================================================================
--- lib/Target/AMDGPU/AMDGPUInstrInfo.td
+++ lib/Target/AMDGPU/AMDGPUInstrInfo.td
@@ -74,6 +74,8 @@
   [SDTCisSameAs<0, 2>, SDTCisSameAs<0, 3>, SDTCisVT<0, i32>, SDTCisVT<1, i1>,
    SDTCisVT<4, i1>]
 >;
 
+def SDT_AMDGPUTCRET : SDTypeProfile<0, 2, [SDTCisPtrTy<0>]>;
+
 //===----------------------------------------------------------------------===//
 // AMDGPU DAG Nodes
 //
@@ -98,6 +100,10 @@
   SDNPVariadic]
 >;
 
+def AMDGPUtc_return: SDNode<"AMDGPUISD::TC_RETURN", SDT_AMDGPUTCRET,
+  [SDNPHasChain, SDNPOptInGlue, SDNPVariadic]
+>;
+
 def AMDGPUtrap : SDNode<"AMDGPUISD::TRAP",
   SDTypeProfile<0, -1, [SDTCisVT<0, i16>]>,
   [SDNPHasChain, SDNPVariadic, SDNPSideEffect, SDNPInGlue]
Index: lib/Target/AMDGPU/AMDGPUMCInstLower.cpp
===================================================================
--- lib/Target/AMDGPU/AMDGPUMCInstLower.cpp
+++ lib/Target/AMDGPU/AMDGPUMCInstLower.cpp
@@ -146,6 +146,9 @@
     OutMI.addOperand(Dest);
     OutMI.addOperand(Src);
     return;
+  } else if (Opcode == AMDGPU::SI_TCRETURN) {
+    // TODO: How to use branch immediate and avoid register+add?
+    Opcode = AMDGPU::S_SETPC_B64;
   }
 
   int MCOpcode = TII->pseudoToMCOpcode(Opcode);
Index: lib/Target/AMDGPU/SIISelLowering.h
===================================================================
--- lib/Target/AMDGPU/SIISelLowering.h
+++ lib/Target/AMDGPU/SIISelLowering.h
@@ -224,6 +224,15 @@
                           const SDLoc &DL, SelectionDAG &DAG,
                           SmallVectorImpl<SDValue> &InVals, bool isThisReturn,
                           SDValue ThisVal) const;
 
+  bool mayBeEmittedAsTailCall(const CallInst *) const override;
+
+  bool isEligibleForTailCallOptimization(
+    SDValue Callee, CallingConv::ID CalleeCC, bool isVarArg,
+    const SmallVectorImpl<ISD::OutputArg> &Outs,
+    const SmallVectorImpl<SDValue> &OutVals,
+    const SmallVectorImpl<ISD::InputArg> &Ins, SelectionDAG &DAG) const;
+
   SDValue LowerCall(CallLoweringInfo &CLI,
                     SmallVectorImpl<SDValue> &InVals) const override;
Index: lib/Target/AMDGPU/SIISelLowering.cpp
===================================================================
--- lib/Target/AMDGPU/SIISelLowering.cpp
+++ lib/Target/AMDGPU/SIISelLowering.cpp
@@ -32,6 +32,7 @@
 #include "llvm/ADT/ArrayRef.h"
 #include "llvm/ADT/BitVector.h"
 #include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/Statistic.h"
 #include "llvm/ADT/StringRef.h"
 #include "llvm/ADT/StringSwitch.h"
 #include "llvm/ADT/Twine.h"
@@ -84,6 +85,10 @@
 
 using namespace llvm;
 
+#define DEBUG_TYPE "si-lower"
+
+STATISTIC(NumTailCalls, "Number of tail calls");
+
 static cl::opt<bool> EnableVGPRIndexMode(
   "amdgpu-vgpr-index-mode",
   cl::desc("Use GPR indexing mode instead of movrel for vector indexing"),
@@ -1647,6 +1652,9 @@
     DAG.getPass()->getAnalysis<AMDGPUArgumentUsageInfo>();
   ArgUsageInfo.setFuncArgInfo(*MF.getFunction(), Info->getArgInfo());
 
+  unsigned StackArgSize = CCInfo.getNextStackOffset();
+  Info->setBytesInStackArgArea(StackArgSize);
+
   return Chains.empty() ? Chain :
          DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Chains);
 }
@@ -1955,6 +1963,103 @@
   }
 }
 
+static bool canGuaranteeTCO(CallingConv::ID CC) {
+  return CC == CallingConv::Fast;
+}
+
+/// Return true if we might ever do TCO for calls with this calling convention.
+static bool mayTailCallThisCC(CallingConv::ID CC) {
+  switch (CC) {
+  case CallingConv::C:
+    return true;
+  default:
+    return canGuaranteeTCO(CC);
+  }
+}
+
+bool SITargetLowering::isEligibleForTailCallOptimization(
+    SDValue Callee, CallingConv::ID CalleeCC, bool IsVarArg,
+    const SmallVectorImpl<ISD::OutputArg> &Outs,
+    const SmallVectorImpl<SDValue> &OutVals,
+    const SmallVectorImpl<ISD::InputArg> &Ins, SelectionDAG &DAG) const {
+  if (!mayTailCallThisCC(CalleeCC))
+    return false;
+
+  MachineFunction &MF = DAG.getMachineFunction();
+  const Function *CallerF = MF.getFunction();
+  CallingConv::ID CallerCC = CallerF->getCallingConv();
+  const SIRegisterInfo *TRI = getSubtarget()->getRegisterInfo();
+  const uint32_t *CallerPreserved = TRI->getCallPreservedMask(MF, CallerCC);
+
+  // Kernels aren't callable, and don't have a live in return address so it
+  // doesn't make sense to do a tail call with entry functions.
+  if (!CallerPreserved)
+    return false;
+
+  bool CCMatch = CallerCC == CalleeCC;
+
+  if (DAG.getTarget().Options.GuaranteedTailCallOpt) {
+    if (canGuaranteeTCO(CalleeCC) && CCMatch)
+      return true;
+    return false;
+  }
+
+  // TODO: Can we handle var args?
+  if (IsVarArg)
+    return false;
+
+  for (const Argument &Arg : CallerF->args()) {
+    if (Arg.hasByValAttr())
+      return false;
+  }
+
+  LLVMContext &Ctx = *DAG.getContext();
+
+  // Check that the call results are passed in the same way.
+  if (!CCState::resultsCompatible(CalleeCC, CallerCC, MF, Ctx, Ins,
+                                  CCAssignFnForCall(CalleeCC, IsVarArg),
+                                  CCAssignFnForCall(CallerCC, IsVarArg)))
+    return false;
+
+  // The callee has to preserve all registers the caller needs to preserve.
+  if (!CCMatch) {
+    const uint32_t *CalleePreserved = TRI->getCallPreservedMask(MF, CalleeCC);
+    if (!TRI->regmaskSubsetEqual(CallerPreserved, CalleePreserved))
+      return false;
+  }
+
+  // Nothing more to check if the callee is taking no arguments.
+  if (Outs.empty())
+    return true;
+
+  SmallVector<CCValAssign, 16> ArgLocs;
+  CCState CCInfo(CalleeCC, IsVarArg, MF, ArgLocs, Ctx);
+
+  CCInfo.AnalyzeCallOperands(Outs, CCAssignFnForCall(CalleeCC, IsVarArg));
+
+  const SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
+  // If the stack arguments for this call do not fit into our own save area
+  // then the call cannot be made tail.
+  // TODO: Is this really necessary?
+  if (CCInfo.getNextStackOffset() > FuncInfo->getBytesInStackArgArea())
+    return false;
+
+  const MachineRegisterInfo &MRI = MF.getRegInfo();
+  return parametersInCSRMatch(MRI, CallerPreserved, ArgLocs, OutVals);
+}
+
+bool SITargetLowering::mayBeEmittedAsTailCall(const CallInst *CI) const {
+  if (!CI->isTailCall())
+    return false;
+
+  const Function *ParentFn = CI->getParent()->getParent();
+  if (AMDGPU::isEntryFunctionCC(ParentFn->getCallingConv()))
+    return false;
+
+  auto Attr = ParentFn->getFnAttribute("disable-tail-calls");
+  return (Attr.getValueAsString() != "true");
+}
+
 // The wave scratch offset register is used as the global base pointer.
 SDValue SITargetLowering::LowerCall(CallLoweringInfo &CLI,
                                     SmallVectorImpl<SDValue> &InVals) const {
@@ -1987,8 +2092,27 @@
                       "unsupported required tail call to function ");
   }
 
-  // TODO: Implement tail calls.
-  IsTailCall = false;
+  // The first 4 bytes are reserved for the callee's emergency stack slot.
+  const unsigned CalleeUsableStackOffset = 4;
+
+  if (IsTailCall) {
+    IsTailCall = isEligibleForTailCallOptimization(
+      Callee, CallConv, IsVarArg, Outs, OutVals, Ins, DAG);
+    if (!IsTailCall && CLI.CS && CLI.CS.isMustTailCall()) {
+      report_fatal_error("failed to perform tail call elimination on a call "
+                         "site marked musttail");
+    }
+
+    bool TailCallOpt = MF.getTarget().Options.GuaranteedTailCallOpt;
+
+    // A sibling call is one where we're under the usual C ABI and not planning
+    // to change that but can still do a tail call:
+    if (!TailCallOpt && IsTailCall)
+      IsSibCall = true;
+
+    if (IsTailCall)
+      ++NumTailCalls;
+  }
 
   if (GlobalAddressSDNode *GA = dyn_cast<GlobalAddressSDNode>(Callee)) {
     // FIXME: Remove this hack for function pointer types.
@@ -2020,8 +2144,8 @@
   // by this amount for a tail call. In a sibling call it must be 0 because the
   // caller will deallocate the entire stack and the callee still expects its
   // arguments to begin at SP+0. Completely unused for non-tail calls.
-  int FPDiff = 0;
-
+  int32_t FPDiff = 0;
+  MachineFrameInfo &MFI = MF.getFrameInfo();
   SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPass;
 
   // Adjust the stack pointer for the new arguments...
@@ -2044,9 +2168,7 @@
 
   // Stack pointer relative accesses are done by changing the offset SGPR. This
   // is just the VGPR offset component.
-
-  // The first 4 bytes are reserved for the callee's emergency stack slot.
-  SDValue StackPtr = DAG.getConstant(4, DL, MVT::i32);
+  SDValue StackPtr = DAG.getConstant(CalleeUsableStackOffset, DL, MVT::i32);
 
   SmallVector<SDValue, 8> MemOpChains;
   MVT PtrVT = MVT::i32;
@@ -2093,10 +2215,28 @@
       SDValue PtrOff = DAG.getConstant(Offset, DL, MVT::i32);
       PtrOff = DAG.getNode(ISD::ADD, DL, PtrVT, StackPtr, PtrOff);
 
-      if (!IsTailCall) {
-        SDValue PtrOff = DAG.getTargetConstant(Offset, DL, MVT::i32);
+      if (IsTailCall) {
+        ISD::ArgFlagsTy Flags = Outs[realArgIdx].Flags;
+        unsigned OpSize = Flags.isByVal() ?
+          Flags.getByValSize() : VA.getValVT().getStoreSize();
+
+        Offset = Offset + FPDiff;
+        int FI = MFI.CreateFixedObject(OpSize, Offset, true);
+
+        DstAddr = DAG.getFrameIndex(FI, PtrVT);
+        DstAddr = DAG.getNode(ISD::ADD, DL, MVT::i32, DstAddr, StackPtr);
+        DstInfo = MachinePointerInfo::getFixedStack(MF, FI);
 
-        DstAddr = DAG.getNode(ISD::ADD, DL, PtrVT, StackPtr, PtrOff);
+        // Make sure any stack arguments overlapping with where we're storing
+        // are loaded before this eventual operation. Otherwise they'll be
+        // clobbered.
+
+        // FIXME: Why is this really necessary? This seems to just result in a
+        // lot of code to copy the stack and write them back to the same
+        // locations, which are supposed to be immutable?
+        Chain = addTokenForArgument(Chain, DAG, MFI, FI);
+      } else {
+        DstAddr = PtrOff;
         DstInfo = MachinePointerInfo::getStack(MF, LocMemOffset);
       }
 
@@ -2132,6 +2272,22 @@
     InFlag = Chain.getValue(1);
   }
 
+
+  SDValue PhysReturnAddrReg;
+  if (IsTailCall) {
+    // Since the return is being combined with the call, we need to pass on the
+    // return address.
+
+    const SIRegisterInfo *TRI = getSubtarget()->getRegisterInfo();
+    SDValue ReturnAddrReg = CreateLiveInRegister(
+      DAG, &AMDGPU::SReg_64RegClass, TRI->getReturnAddressReg(MF), MVT::i64);
+
+    PhysReturnAddrReg = DAG.getRegister(TRI->getReturnAddressReg(MF),
+                                        MVT::i64);
+    Chain = DAG.getCopyToReg(Chain, DL, PhysReturnAddrReg, ReturnAddrReg, InFlag);
+    InFlag = Chain.getValue(1);
+  }
+
   // We don't usually want to end the call-sequence here because we would tidy
   // the frame up *after* the call, however in the ABI-changing tail-call case
   // we've carefully laid out the parameters so that when sp is reset they'll be
@@ -2153,6 +2309,8 @@
     // this information must travel along with the operation for eventual
     // consumption by emitEpilogue.
     Ops.push_back(DAG.getTargetConstant(FPDiff, DL, MVT::i32));
+
+    Ops.push_back(PhysReturnAddrReg);
   }
 
   // Add argument registers to the end of the list so that they are known live
@@ -2177,8 +2335,8 @@
   // If we're doing a tail call, use a TC_RETURN here rather than an
   // actual call instruction.
   if (IsTailCall) {
-    MF.getFrameInfo().setHasTailCall();
-    llvm_unreachable("not implemented");
+    MFI.setHasTailCall();
+    return DAG.getNode(AMDGPUISD::TC_RETURN, DL, NodeTys, Ops);
   }
 
   // Returns a chain and a flag for retval copy to use.
@@ -2873,7 +3031,8 @@
       .addReg(Info->getStackPtrOffsetReg(), RegState::Implicit);
     return BB;
   }
-  case AMDGPU::SI_CALL_ISEL: {
+  case AMDGPU::SI_CALL_ISEL:
+  case AMDGPU::SI_TCRETURN_ISEL: {
     const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
     const DebugLoc &DL = MI.getDebugLoc();
     unsigned ReturnAddrReg = TII->getRegisterInfo().getReturnAddressReg(*MF);
@@ -2885,17 +3044,24 @@
 
     const GlobalValue *G = PCRel->getOperand(1).getGlobal();
 
-    MachineInstrBuilder MIB =
-      BuildMI(*BB, MI, DL, TII->get(AMDGPU::SI_CALL), ReturnAddrReg)
-      .add(MI.getOperand(0))
-      .addGlobalAddress(G);
+    MachineInstrBuilder MIB;
+    if (MI.getOpcode() == AMDGPU::SI_CALL_ISEL) {
+      MIB = BuildMI(*BB, MI, DL, TII->get(AMDGPU::SI_CALL), ReturnAddrReg)
+        .add(MI.getOperand(0))
+        .addGlobalAddress(G);
+    } else {
+      MIB = BuildMI(*BB, MI, DL, TII->get(AMDGPU::SI_TCRETURN))
+        .add(MI.getOperand(0))
+        .addGlobalAddress(G);
+
+      // There is an additional imm operand for tcreturn, but it should be in the
+      // right place already.
+    }
 
     for (unsigned I = 1, E = MI.getNumOperands(); I != E; ++I)
       MIB.add(MI.getOperand(I));
 
-    MIB.setMemRefs(MI.memoperands_begin(), MI.memoperands_end());
-
     MI.eraseFromParent();
     return BB;
   }
Index: lib/Target/AMDGPU/SIInstructions.td
===================================================================
--- lib/Target/AMDGPU/SIInstructions.td
+++ lib/Target/AMDGPU/SIInstructions.td
@@ -378,6 +378,31 @@
   (outs SReg_64:$dst), (ins SSrc_b64:$src0, unknown:$callee)> {
   let Size = 4;
   let isCall = 1;
+  let UseNamedOperandTable = 1;
+  let SchedRW = [WriteBranch];
+}
+
+// Tail call handling pseudo
+def SI_TCRETURN_ISEL : SPseudoInstSI<(outs),
+  (ins SSrc_b64:$src0, i32imm:$fpdiff),
+  [(AMDGPUtc_return i64:$src0, i32:$fpdiff)]> {
+  let isCall = 1;
+  let isTerminator = 1;
+  let isReturn = 1;
+  let isBarrier = 1;
+  let SchedRW = [WriteBranch];
+  let usesCustomInserter = 1;
+}
+
+def SI_TCRETURN : SPseudoInstSI <
+  (outs),
+  (ins SSrc_b64:$src0, unknown:$callee, i32imm:$fpdiff)> {
+  let Size = 4;
+  let isCall = 1;
+  let isTerminator = 1;
+  let isReturn = 1;
+  let isBarrier = 1;
+  let UseNamedOperandTable = 1;
   let SchedRW = [WriteBranch];
 }
Index: lib/Target/AMDGPU/SIMachineFunctionInfo.h
===================================================================
--- lib/Target/AMDGPU/SIMachineFunctionInfo.h
+++ lib/Target/AMDGPU/SIMachineFunctionInfo.h
@@ -110,6 +110,17 @@
   unsigned PSInputAddr = 0;
   unsigned PSInputEnable = 0;
 
+  /// Number of bytes of arguments this function has on the stack. If the callee
+  /// is expected to restore the argument stack this should be a multiple of 16,
+  /// all usable during a tail call.
+  ///
+  /// The alternative would forbid tail call optimisation in some cases: if we
+  /// want to transfer control from a function with 8-bytes of stack-argument
+  /// space to a function with 16-bytes then misalignment of this value would
+  /// make a stack adjustment necessary, which could not be undone by the
+  /// callee.
+  unsigned BytesInStackArgArea = 0;
+
   bool ReturnsVoid = true;
 
   // A pair of default/requested minimum/maximum flat work group sizes.
@@ -235,6 +246,14 @@
   unsigned getTIDReg() const { return TIDReg; }
   void setTIDReg(unsigned Reg) { TIDReg = Reg; }
 
+  unsigned getBytesInStackArgArea() const {
+    return BytesInStackArgArea;
+  }
+
+  void setBytesInStackArgArea(unsigned Bytes) {
+    BytesInStackArgArea = Bytes;
+  }
+
   // Add user SGPRs.
   unsigned addPrivateSegmentBuffer(const SIRegisterInfo &TRI);
   unsigned addDispatchPtr(const SIRegisterInfo &TRI);
Index: test/CodeGen/AMDGPU/sibling-call.ll
===================================================================
--- /dev/null
+++ test/CodeGen/AMDGPU/sibling-call.ll
@@ -0,0 +1,225 @@
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=fiji -mattr=-flat-for-global -amdgpu-sroa=0 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,VI,MESA %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=hawaii -amdgpu-sroa=0 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,CI,MESA %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -mattr=-flat-for-global -amdgpu-sroa=0 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX9,VI,MESA %s
+
+; GCN-LABEL: {{^}}i32_fastcc_i32_i32:
+; GCN: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_add_i32_e32 v0, vcc, v1, v0
+; GCN-NEXT: s_setpc_b64
+define fastcc i32 @i32_fastcc_i32_i32(i32 %arg0, i32 %arg1) #1 {
+  %add0 = add i32 %arg0, %arg1
+  ret i32 %add0
+}
+
+; GCN-LABEL: {{^}}sibling_call_i32_fastcc_i32_i32:
+define fastcc i32 @sibling_call_i32_fastcc_i32_i32(i32 %a, i32 %b, i32 %c) #1 {
+entry:
+  %ret = tail call fastcc i32 @i32_fastcc_i32_i32(i32 %a, i32 %b)
+  ret i32 %ret
+}
+
+; GCN-LABEL: {{^}}sibling_call_i32_fastcc_i32_i32_stack_object:
+; GCN: v_mov_b32_e32 [[NINE:v[0-9]+]], 9
+; GCN: buffer_store_dword [[NINE]], off, s[0:3], s5 offset:24
+; GCN: s_setpc_b64
+define fastcc i32 @sibling_call_i32_fastcc_i32_i32_stack_object(i32 %a, i32 %b, i32 %c) #1 {
+entry:
+  %alloca = alloca [16 x i32], align 4
+  %gep = getelementptr inbounds [16 x i32], [16 x i32]* %alloca, i32 0, i32 5
+  store volatile i32 9, i32* %gep
+  %ret = tail call fastcc i32 @i32_fastcc_i32_i32(i32 %a, i32 %b)
+  ret i32 %ret
+}
+
+; GCN-LABEL: {{^}}sibling_call_i32_fastcc_i32_i32_unused_result:
+define fastcc void @sibling_call_i32_fastcc_i32_i32_unused_result(i32 %a, i32 %b, i32 %c) #1 {
+entry:
+  %ret = tail call fastcc i32 @i32_fastcc_i32_i32(i32 %a, i32 %b)
+  ret void
+}
+
+; It doesn't make sense to do a tail call from a kernel
+; GCN-LABEL: {{^}}kernel_call_i32_fastcc_i32_i32_unused_result:
+;define amdgpu_kernel void @kernel_call_i32_fastcc_i32_i32_unused_result(i32 %a, i32 %b, i32 %c) #1 {
+define amdgpu_kernel void @kernel_call_i32_fastcc_i32_i32_unused_result(i32 %a, i32 %b, i32 %c) #1 {
+entry:
+  %ret = tail call fastcc i32 @i32_fastcc_i32_i32(i32 %a, i32 %b)
+  ret void
+}
+
+; GCN-LABEL: {{^}}i32_fastcc_i32_byval_i32:
+; GCN: s_waitcnt
+; GCN-NEXT: s_mov_b32 s5, s32
+; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s5 offset:4
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_add_i32_e32 v0, vcc, v1, v0
+; GCN-NEXT: s_setpc_b64 s[30:31]
+define fastcc i32 @i32_fastcc_i32_byval_i32(i32 %arg0, i32* byval align 4 %arg1) #1 {
+  %arg1.load = load i32, i32* %arg1, align 4
+  %add0 = add i32 %arg0, %arg1.load
+  ret i32 %add0
+}
+
+; Tail call disallowed with byval in parent.
+; GCN-LABEL: {{^}}sibling_call_i32_fastcc_i32_byval_i32_byval_parent:
+; GCN: buffer_store_dword v{{[0-9]+}}, off, s[0:3], s32 offset:4
+; GCN: s_swappc_b64
+; GCN: s_setpc_b64
+define fastcc i32 @sibling_call_i32_fastcc_i32_byval_i32_byval_parent(i32 %a, i32* byval %b.byval, i32 %c) #1 {
+entry:
+  %ret = tail call fastcc i32 @i32_fastcc_i32_byval_i32(i32 %a, i32* %b.byval)
+  ret i32 %ret
+}
+
+; Tail call disallowed with byval in parent, not callee.
+; GCN-LABEL: {{^}}sibling_call_i32_fastcc_i32_byval_i32:
+; GCN-NOT: v0
+; GCN-NOT: s32
+; GCN: buffer_load_dword v1, off, s[0:3], s4 offset:16
+; GCN: s_mov_b32 s5, s32
+; GCN: buffer_store_dword v1, off, s[0:3], s5 offset:4
+; GCN-NEXT: s_setpc_b64
+define fastcc i32 @sibling_call_i32_fastcc_i32_byval_i32(i32 %a, [16 x i32] %large) #1 {
+entry:
+  %ret = tail call fastcc i32 @i32_fastcc_i32_byval_i32(i32 %a, i32* inttoptr (i32 16 to i32*))
+  ret i32 %ret
+}
+
+; GCN-LABEL: {{^}}i32_fastcc_i32_i32_a32i32:
+; GCN: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-DAG: buffer_load_dword [[LOAD_0:v[0-9]+]], off, s[0:3], s5 offset:4
+; GCN-DAG: buffer_load_dword [[LOAD_1:v[0-9]+]], off, s[0:3], s5 offset:8
+; GCN-DAG: v_add_i32_e32 v0, vcc, v1, v0
+; GCN: v_add_i32_e32 v0, vcc, [[LOAD_0]], v0
+; GCN: v_add_i32_e32 v0, vcc, [[LOAD_1]], v0
+; GCN-NEXT: s_setpc_b64
+define fastcc i32 @i32_fastcc_i32_i32_a32i32(i32 %arg0, i32 %arg1, [32 x i32] %large) #1 {
+  %val_firststack = extractvalue [32 x i32] %large, 30
+  %val_laststack = extractvalue [32 x i32] %large, 31
+  %add0 = add i32 %arg0, %arg1
+  %add1 = add i32 %add0, %val_firststack
+  %add2 = add i32 %add1, %val_laststack
+  ret i32 %add2
+}
+
+; FIXME: Why load and store same location for stack args?
+; GCN-LABEL: {{^}}sibling_call_i32_fastcc_i32_i32_a32i32:
+; GCN: s_mov_b32 s5, s32
+
+; GCN-DAG: buffer_store_dword v32, off, s[0:3], s5 offset:16 ; 4-byte Folded Spill
+; GCN-DAG: buffer_store_dword v33, off, s[0:3], s5 offset:12 ; 4-byte Folded Spill
+
+; GCN-DAG: buffer_load_dword [[LOAD_0:v[0-9]+]], off, s[0:3], s5 offset:4
+; GCN-DAG: buffer_load_dword [[LOAD_1:v[0-9]+]], off, s[0:3], s5 offset:8
+
+; GCN-NOT: s32
+
+; GCN-DAG: buffer_store_dword [[LOAD_0]], off, s[0:3], s5 offset:4
+; GCN-DAG: buffer_store_dword [[LOAD_1]], off, s[0:3], s5 offset:8
+
+; GCN-DAG: buffer_load_dword v32, off, s[0:3], s5 offset:16 ; 4-byte Folded Reload
+; GCN-DAG: buffer_load_dword v33, off, s[0:3], s5 offset:12 ; 4-byte Folded Reload
+
+; GCN-NOT: s32
+; GCN: s_setpc_b64
+define fastcc i32 @sibling_call_i32_fastcc_i32_i32_a32i32(i32 %a, i32 %b, [32 x i32] %c) #1 {
+entry:
+  %ret = tail call fastcc i32 @i32_fastcc_i32_i32_a32i32(i32 %a, i32 %b, [32 x i32] %c)
+  ret i32 %ret
+}
+
+; GCN-LABEL: {{^}}sibling_call_i32_fastcc_i32_i32_a32i32_stack_object:
+; GCN-DAG: s_mov_b32 s5, s32
+; GCN-NOT: s32
+; GCN-DAG: v_mov_b32_e32 [[NINE:v[0-9]+]], 9
+; GCN: buffer_store_dword [[NINE]], off, s[0:3], s5 offset:44
+
+; GCN-NOT: s32
+; GCN: s_setpc_b64
+define fastcc i32 @sibling_call_i32_fastcc_i32_i32_a32i32_stack_object(i32 %a, i32 %b, [32 x i32] %c) #1 {
+entry:
+  %alloca = alloca [16 x i32], align 4
+  %gep = getelementptr inbounds [16 x i32], [16 x i32]* %alloca, i32 0, i32 5
+  store volatile i32 9, i32* %gep
+  %ret = tail call fastcc i32 @i32_fastcc_i32_i32_a32i32(i32 %a, i32 %b, [32 x i32] %c)
+  ret i32 %ret
+}
+
+; If the callee requires more stack argument space than the caller,
+; don't do a tail call.
+; TODO: Do we really need this restriction?
+
+; GCN-LABEL: {{^}}no_sibling_call_callee_more_stack_space:
+; GCN: s_swappc_b64
+; GCN: s_setpc_b64
+define fastcc i32 @no_sibling_call_callee_more_stack_space(i32 %a, i32 %b) #1 {
+entry:
+  %ret = tail call fastcc i32 @i32_fastcc_i32_i32_a32i32(i32 %a, i32 %b, [32 x i32] zeroinitializer)
+  ret i32 %ret
+}
+
+; Have another non-tail call in the function
+; GCN-LABEL: {{^}}sibling_call_i32_fastcc_i32_i32_other_call:
+; GCN: s_mov_b32 s5, s32
+; GCN: buffer_store_dword v34, off, s[0:3], s5 offset:12
+; GCN: buffer_store_dword v32, off, s[0:3], s5 offset:8 ; 4-byte Folded Spill
+; GCN: buffer_store_dword v33, off, s[0:3], s5 offset:4 ; 4-byte Folded Spill
+; GCN-DAG: v_writelane_b32 v34, s33, 0
+; GCN-DAG: v_writelane_b32 v34, s34, 1
+; GCN-DAG: v_writelane_b32 v34, s35, 2
+; GCN-DAG: s_add_u32 s32, s32, 0x400

+; GCN: s_getpc_b64
+; GCN: s_swappc_b64

+; GCN: s_getpc_b64 s[6:7]
+; GCN: s_add_u32 s6, s6, sibling_call_i32_fastcc_i32_i32@rel32@lo+4
+; GCN: s_addc_u32 s7, s7, sibling_call_i32_fastcc_i32_i32@rel32@hi+4

+; GCN-DAG: v_readlane_b32 s33, v34, 0
+; GCN-DAG: v_readlane_b32 s34, v34, 1
+; GCN-DAG: v_readlane_b32 s35, v34, 2

+; GCN: buffer_load_dword v33, off, s[0:3], s5 offset:4
+; GCN: buffer_load_dword v32, off, s[0:3], s5 offset:8
+; GCN: buffer_load_dword v34, off, s[0:3], s5 offset:12
+; GCN: s_sub_u32 s32, s32, 0x400
+; GCN: s_setpc_b64 s[6:7]
+define fastcc i32 @sibling_call_i32_fastcc_i32_i32_other_call(i32 %a, i32 %b, i32 %c) #1 {
+entry:
+  %other.call = tail call fastcc i32 @i32_fastcc_i32_i32(i32 %a, i32 %b)
+  %ret = tail call fastcc i32 @sibling_call_i32_fastcc_i32_i32(i32 %a, i32 %b, i32 %other.call)
+  ret i32 %ret
+}
+
+; Have stack object in caller and stack passed arguments. SP should be
+; in same place at function exit.
+
+; GCN-LABEL: {{^}}sibling_call_stack_objecti32_fastcc_i32_i32_a32i32:
+; GCN: s_mov_b32 s5, s32
+; GCN-NOT: s32
+; GCN: s_setpc_b64 s[6:7]
+define fastcc i32 @sibling_call_stack_objecti32_fastcc_i32_i32_a32i32(i32 %a, i32 %b, [32 x i32] %c) #1 {
+entry:
+  %alloca = alloca [16 x i32], align 4
+  %gep = getelementptr inbounds [16 x i32], [16 x i32]* %alloca, i32 0, i32 5
+  store volatile i32 9, i32* %gep
+  %ret = tail call fastcc i32 @i32_fastcc_i32_i32_a32i32(i32 %a, i32 %b, [32 x i32] %c)
+  ret i32 %ret
+}
+
+; GCN-LABEL: {{^}}sibling_call_stack_objecti32_fastcc_i32_i32_a32i32_larger_arg_area:
+; GCN: s_mov_b32 s5, s32
+; GCN-NOT: s32
+; GCN: s_setpc_b64 s[6:7]
+define fastcc i32 @sibling_call_stack_objecti32_fastcc_i32_i32_a32i32_larger_arg_area(i32 %a, i32 %b, [36 x i32] %c) #1 {
+entry:
+  %alloca = alloca [16 x i32], align 4
+  %gep = getelementptr inbounds [16 x i32], [16 x i32]* %alloca, i32 0, i32 5
+  store volatile i32 9, i32* %gep
+  %ret = tail call fastcc i32 @i32_fastcc_i32_i32_a32i32(i32 %a, i32 %b, [32 x i32] zeroinitializer)
+  ret i32 %ret
+}
+
+attributes #0 = { nounwind }
+attributes #1 = { nounwind noinline }
Index: test/CodeGen/AMDGPU/tail-call-cgp.ll
===================================================================
--- /dev/null
+++ test/CodeGen/AMDGPU/tail-call-cgp.ll
@@ -0,0 +1,43 @@
+; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -codegenprepare %s | FileCheck %s
+
+define internal fastcc void @callee(i32* nocapture %p, i32 %a) #0 {
+  store volatile i32 %a, i32* %p, align 4
+  ret void
+}
+
+; CHECK-LABEL: @func_caller(
+; CHECK: tail call fastcc void @callee(
+; CHECK-NEXT: ret void
+; CHECK: ret void
+define void @func_caller(i32* nocapture %p, i32 %a, i32 %b) #0 {
+entry:
+  %cmp = icmp eq i32 %b, 0
+  br i1 %cmp, label %bb, label %ret
+
+bb:
+  tail call fastcc void @callee(i32* %p, i32 %a)
+  br label %ret
+
+ret:
+  ret void
+}
+
+; CHECK-LABEL: @kernel_caller(
+; CHECK: tail call fastcc void @callee(
+; CHECK-NEXT: br label %ret
+
+; CHECK: ret void
+define amdgpu_kernel void @kernel_caller(i32* nocapture %p, i32 %a, i32 %b) #0 {
+entry:
+  %cmp = icmp eq i32 %b, 0
+  br i1 %cmp, label %bb, label %ret
+
+bb:
+  tail call fastcc void @callee(i32* %p, i32 %a)
+  br label %ret
+
+ret:
+  ret void
+}
+
+attributes #0 = { nounwind }