Index: lib/Target/PowerPC/PPCISelLowering.h =================================================================== --- lib/Target/PowerPC/PPCISelLowering.h +++ lib/Target/PowerPC/PPCISelLowering.h @@ -693,6 +693,16 @@ const SmallVectorImpl<ISD::InputArg> &Ins, SelectionDAG& DAG) const; + bool + IsEligibleForTailCallOptimization_64SVR4( + SDValue Callee, + CallingConv::ID CalleeCC, + bool isVarArg, + const SmallVectorImpl<ISD::OutputArg> &Outs, + const SmallVectorImpl<SDValue> &OutVals, + const SmallVectorImpl<ISD::InputArg> &Ins, + SelectionDAG& DAG) const; + SDValue EmitTailCallLoadFPAndRetAddr(SelectionDAG & DAG, int SPDiff, SDValue Chain, Index: lib/Target/PowerPC/PPCISelLowering.cpp =================================================================== --- lib/Target/PowerPC/PPCISelLowering.cpp +++ lib/Target/PowerPC/PPCISelLowering.cpp @@ -19,6 +19,7 @@ #include "PPCTargetMachine.h" #include "PPCTargetObjectFile.h" #include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/Statistic.h" #include "llvm/ADT/StringSwitch.h" #include "llvm/ADT/Triple.h" #include "llvm/CodeGen/CallingConvLower.h" @@ -36,12 +37,15 @@ #include "llvm/IR/Intrinsics.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/ErrorHandling.h" +#include "llvm/Support/Format.h" #include "llvm/Support/MathExtras.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Target/TargetOptions.h" using namespace llvm; +#define DEBUG_TYPE "ppc-lowering" + // FIXME: Remove this once soft-float is supported. 
static cl::opt<bool> DisablePPCFloatInVariadic("disable-ppc-float-in-variadic", cl::desc("disable saving float registers for va_start on PPC"), cl::Hidden); @@ -55,6 +59,12 @@ static cl::opt<bool> DisablePPCUnaligned("disable-ppc-unaligned", cl::desc("disable unaligned load/store generation on PPC"), cl::Hidden); +static cl::opt<bool> DisableSCO("disable-ppc-sco", +cl::desc("disable sibling call optimization on ppc"), cl::Hidden); + +STATISTIC(NumTailCalls, "Number of tail calls"); +STATISTIC(NumSiblingCalls, "Number of sibling calls"); + // FIXME: Remove this once the bug has been fixed! extern cl::opt<bool> ANDIGlueBug; @@ -3832,6 +3842,179 @@ return SPDiff; } +static bool isFunctionGlobalAddress(SDValue Callee); + +static bool +ResideInSameModule(SDValue Callee, Reloc::Model RelMod) { + // If !G, Callee can be an external symbol. + GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee); + if (!G) return false; + + const GlobalValue *GV = G->getGlobal(); + + if (GV->isDeclaration()) return false; + + switch(GV->getLinkage()) { + default: llvm_unreachable("unknown linkage type"); + case GlobalValue::AvailableExternallyLinkage: + case GlobalValue::ExternalWeakLinkage: + return false; + + // Callee with weak linkage is allowed if it has hidden or protected + // visibility + case GlobalValue::LinkOnceAnyLinkage: + case GlobalValue::LinkOnceODRLinkage: // e.g. c++ inline functions + case GlobalValue::WeakAnyLinkage: + case GlobalValue::WeakODRLinkage: // e.g. c++ template instantiation + if (GV->hasDefaultVisibility()) + return false; + + case GlobalValue::ExternalLinkage: + case GlobalValue::InternalLinkage: + case GlobalValue::PrivateLinkage: + break; + } + + // With '-fPIC', calling a default-visibility function requires a 'nop' to be + // inserted after the call, whether or not that function resides in the same + // module, so we treat it as residing in a different module. 
+ if (RelMod == Reloc::PIC_ && GV->hasDefaultVisibility()) + return false; + + return true; +} + +static bool +NeedStackSlotPassParameters(const PPCSubtarget &Subtarget, + const SmallVectorImpl<ISD::OutputArg> &Outs) { + assert(Subtarget.isSVR4ABI() && Subtarget.isPPC64()); + + const unsigned PtrByteSize = 8; + const unsigned LinkageSize = Subtarget.getFrameLowering()->getLinkageSize(); + + static const MCPhysReg GPR[] = { + PPC::X3, PPC::X4, PPC::X5, PPC::X6, + PPC::X7, PPC::X8, PPC::X9, PPC::X10, + }; + static const MCPhysReg VR[] = { + PPC::V2, PPC::V3, PPC::V4, PPC::V5, PPC::V6, PPC::V7, PPC::V8, + PPC::V9, PPC::V10, PPC::V11, PPC::V12, PPC::V13 + }; + + const unsigned Num_GPR_Regs = array_lengthof(GPR); + const unsigned Num_FPR_Regs = 13; + const unsigned Num_VR_Regs = array_lengthof(VR); + const unsigned ParamAreaSize = Num_GPR_Regs * PtrByteSize; + + unsigned NumBytes = LinkageSize; + unsigned AvailableFPRs = Num_FPR_Regs; + unsigned AvailableVRs = Num_VR_Regs; + + for (const ISD::OutputArg& Param : Outs) { + if (Param.Flags.isNest()) continue; + + if (CalculateStackSlotUsed(Param.VT, Param.ArgVT, Param.Flags, + PtrByteSize, LinkageSize, ParamAreaSize, + NumBytes, AvailableFPRs, AvailableVRs, + Subtarget.hasQPX())) + return true; + } + return false; +} + +bool +PPCTargetLowering::IsEligibleForTailCallOptimization_64SVR4( + SDValue Callee, + CallingConv::ID CalleeCC, + bool isVarArg, + const SmallVectorImpl<ISD::OutputArg> &Outs, + const SmallVectorImpl<SDValue> &OutVals, + const SmallVectorImpl<ISD::InputArg> &Ins, + SelectionDAG& DAG) const { + bool TailCallOpt = getTargetMachine().Options.GuaranteedTailCallOpt; + + if (DisableSCO && !TailCallOpt) return false; + + // Variadic argument functions are not supported. 
+ if (isVarArg) return false; + + MachineFunction &MF = DAG.getMachineFunction(); + CallingConv::ID CallerCC = MF.getFunction()->getCallingConv(); + + // Tail or Sibling call optimization (TCO/SCO) requires the callee and the + // caller to have the same calling convention. + if (CallerCC != CalleeCC) return false; + + // SCO supports the C calling convention. + if (CalleeCC != CallingConv::Fast && CalleeCC != CallingConv::C) + return false; + + // Functions containing by val parameters are not supported. + for (const ISD::InputArg& IA : Ins) + if (IA.Flags.isByVal()) + return false; + + // No TCO/SCO on an indirect call because the caller has to restore its TOC. + if (!isFunctionGlobalAddress(Callee) && + !isa<ExternalSymbolSDNode>(Callee)) + return false; + + // Check if the callee resides in the same module, because for now, the + // PPC64 SVR4 ABI (ELFv1/ELFv2) doesn't allow tail calls to a symbol that + // resides in another module. + // ref: https://bugzilla.mozilla.org/show_bug.cgi?id=973977 + if (!ResideInSameModule(Callee, getTargetMachine().getRelocationModel())) + return false; + + // TCO allows altering callee ABI, so we don't have to check further. + if (CalleeCC == CallingConv::Fast && TailCallOpt) + return true; + + if (DisableSCO) return false; + + if (NeedStackSlotPassParameters(Subtarget, Outs)) { + // TODO: Allow SCO if the caller and callee have the same function prototype. + return false; + } + + // Handle the struct-return (i.e. return type is a struct) case. + if (Outs.size() && Outs[0].Flags.isSRet()) { + // Case we would like to check: + // Type callee() { return Type; } + // Type caller() { return callee() } + // + // If the caller and callee use different struct-return pointers, we can't + // apply SCO on the callee. + // + // Some background: if a function returns a struct, LLVM will pass a struct- + // return pointer as the 1st (hidden) parameter to the function, i.e. 
+ // + // define void @callee(%struct.Type* noalias nocapture sret %agg.result) + // define void @caller(%struct.Type* noalias nocapture sret %agg.result) + // + // The PPC ABI defines the same rule. + // + // So we need to check whether the caller and callee use the same pointer. + + // OutVals[0] is the first argument; it stands for the hidden struct-return + // pointer and will look like: + // t2: i64,ch = CopyFromReg t0, Register:i64 %vreg0 + // + // Its operand 1 is Register:i64 %vreg0, meaning vreg0 holds this pointer. + assert(OutVals[0]->getOperand(1).getOpcode() == ISD::Register); + + RegisterSDNode *RSD = cast<RegisterSDNode>(OutVals[0]->getOperand(1)); + unsigned calleeArg0Vreg = RSD->getReg(); + + // Look at the caller's 1st argument (struct-return pointer) to get its vreg id. + unsigned callerArg0Vreg = MF.getRegInfo().livein_begin()->second; + + if (callerArg0Vreg != calleeArg0Vreg) return false; + } + + return true; +} + /// IsEligibleForTailCallOptimization - Check whether the call is eligible /// for tail call optimization. Targets which want to do tail call /// optimization should implement this function. 
@@ -4469,9 +4652,31 @@ bool IsPatchPoint = CLI.IsPatchPoint; ImmutableCallSite *CS = CLI.CS; - if (isTailCall) - isTailCall = IsEligibleForTailCallOptimization(Callee, CallConv, isVarArg, - Ins, DAG); + if (isTailCall) { + if (Subtarget.isSVR4ABI() && Subtarget.isPPC64()) + isTailCall = + IsEligibleForTailCallOptimization_64SVR4(Callee, CallConv, isVarArg, + Outs, OutVals, Ins, DAG); + else + isTailCall = IsEligibleForTailCallOptimization(Callee, CallConv, isVarArg, + Ins, DAG); + if (isTailCall) { + ++NumTailCalls; + if (!getTargetMachine().Options.GuaranteedTailCallOpt) + ++NumSiblingCalls; + + assert(isa<GlobalAddressSDNode>(Callee)); + DEBUG( + const GlobalValue *GV = cast<GlobalAddressSDNode>(Callee)->getGlobal(); + const unsigned Width = 80 - strlen("TCO caller: ") + - strlen(", callee linkage: 0, 0"); + dbgs() << "TCO caller: " + << left_justify(DAG.getMachineFunction().getName(), Width) + << ", callee linkage: " + << GV->getVisibility() << ", " << GV->getLinkage() << "\n" + ); + } + } if (!isTailCall && CS && CS->isMustTailCall()) report_fatal_error("failed to perform tail call elimination on a call " @@ -4750,12 +4955,16 @@ bool isLittleEndian = Subtarget.isLittleEndian(); unsigned NumOps = Outs.size(); bool hasNest = false; + bool IsSibCall = false; EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy(DAG.getDataLayout()); unsigned PtrByteSize = 8; MachineFunction &MF = DAG.getMachineFunction(); + if (isTailCall && !getTargetMachine().Options.GuaranteedTailCallOpt) + IsSibCall = true; + // Mark this function as potentially containing a function that contains a // tail call. As a consequence the frame pointer will be used for dynamicalloc // and restoring the callers stack pointer in this functions epilog. This is @@ -4875,9 +5084,12 @@ CallConv == CallingConv::Fast) NumBytes = EnsureStackAlignment(Subtarget.getFrameLowering(), NumBytes); + int SPDiff = 0; + + // Calculate by how many bytes the stack has to be adjusted in case of tail + // call optimization. 
- int SPDiff = CalculateTailCallSPDiff(DAG, isTailCall, NumBytes); + if (!IsSibCall) + SPDiff = CalculateTailCallSPDiff(DAG, isTailCall, NumBytes); // To protect arguments on the stack from being clobbered in a tail call, // force all the loads to happen before doing any other lowering. @@ -4886,8 +5098,9 @@ // Adjust the stack pointer for the new arguments... // These operations are automatically eliminated by the prolog/epilog pass - Chain = DAG.getCALLSEQ_START(Chain, DAG.getIntPtrConstant(NumBytes, dl, true), - dl); + if (!IsSibCall) + Chain = DAG.getCALLSEQ_START(Chain, + DAG.getIntPtrConstant(NumBytes, dl, true), dl); SDValue CallSeqStart = Chain; // Load the return address and frame pointer so it can be move somewhere else @@ -5356,7 +5569,7 @@ InFlag = Chain.getValue(1); } - if (isTailCall) + if (isTailCall && !IsSibCall) PrepareTailCall(DAG, InFlag, Chain, dl, true, SPDiff, NumBytes, LROp, FPOp, true, TailCallArguments); Index: test/CodeGen/PowerPC/ppc64-calls.ll =================================================================== --- test/CodeGen/PowerPC/ppc64-calls.ll +++ test/CodeGen/PowerPC/ppc64-calls.ll @@ -14,7 +14,8 @@ define void @test_direct() nounwind readnone { ; CHECK-LABEL: test_direct: tail call void @foo() nounwind -; CHECK: bl foo +; Because of tail call optimization, it can be 'b' instruction. 
+; CHECK: [[BR:b[l]?]] foo ; CHECK-NOT: nop ret void } Index: test/CodeGen/PowerPC/ppc64-sibcall.ll =================================================================== --- /dev/null +++ test/CodeGen/PowerPC/ppc64-sibcall.ll @@ -0,0 +1,153 @@ +; RUN: llc < %s -O1 -verify-machineinstrs -mtriple=powerpc64-unknown-linux-gnu | FileCheck %s -check-prefix=CHECK-SCO +; RUN: llc < %s -O1 -verify-machineinstrs -mtriple=powerpc64-unknown-linux-gnu -mcpu=pwr8 | FileCheck %s -check-prefix=CHECK-SCO-HASQPX + +%S_56 = type { [13 x i32], i32 } +%S_64 = type { [15 x i32], i32 } +%S_32 = type { [7 x i32], i32 } + +; Function Attrs: noinline nounwind +define void @callee_56_copy([7 x i64] %a.coerce, %S_56* %b) #0 { ret void } +define void @callee_64_copy([8 x i64] %a.coerce, %S_64* %b) #0 { ret void } + +; Function Attrs: nounwind +define void @caller_56_reorder_copy(%S_56* %b, [7 x i64] %a.coerce) #1 { + tail call void @callee_56_copy([7 x i64] %a.coerce, %S_56* %b) + ret void + +; CHECK-SCO-LABEL: caller_56_reorder_copy: +; CHECK-SCO-NOT: stdu 1 +; CHECK-SCO: TC_RETURNd8 callee_56_copy +} + +define void @caller_64_reorder_copy(%S_64* %b, [8 x i64] %a.coerce) #1 { + tail call void @callee_64_copy([8 x i64] %a.coerce, %S_64* %b) + ret void + +; CHECK-SCO-LABEL: caller_64_reorder_copy: +; CHECK-SCO: bl callee_64_copy +} + +define void @arg8_callee( + float %a, i32 signext %b, float %c, i32* %d, + i8 zeroext %e, float %f, i32* %g, i32 signext %h) +{ + ret void +} + +define void @arg8_caller(float %a, i32 signext %b, i8 zeroext %c, i32* %d) { +entry: + tail call void @arg8_callee(float undef, i32 signext undef, float undef, + i32* %d, i8 zeroext undef, float undef, + i32* undef, i32 signext undef) + ret void + +; CHECK-SCO-LABEL: arg8_caller: +; CHECK-SCO: b arg8_callee +} + +; Struct return test + +; Function Attrs: noinline nounwind +define void @callee_ret_56(%S_56* noalias sret %agg.result) #0 { ret void } +define void @callee_ret_32(%S_32* noalias sret %agg.result) #0 { ret void } 
+ +; Function Attrs: nounwind +define void @caller_do_something_ret_32(%S_32* noalias sret %agg.result) #1 { + %1 = alloca %S_56, align 4 + %2 = bitcast %S_56* %1 to i8* + call void @callee_ret_56(%S_56* nonnull sret %1) + tail call void @callee_ret_32(%S_32* sret %agg.result) + ret void + +; CHECK-SCO-LABEL: caller_do_something_ret_32: +; CHECK-SCO: stdu 1 +; CHECK-SCO: bl callee_ret_56 +; CHECK-SCO: addi 1 +; CHECK-SCO: TC_RETURNd8 callee_ret_32 +} + +attributes #0 = { noinline nounwind } +attributes #1 = { nounwind } + +; vector <4 x i1> test + +define void @callee_v4i1(i8 %a, <4 x i1> %b, <4 x i1> %c) { ret void } +define void @caller_v4i1_reorder(i8 %a, <4 x i1> %b, <4 x i1> %c) { + tail call void @callee_v4i1(i8 %a, <4 x i1> %c, <4 x i1> %b) + ret void + +; <4 x i1> is 32 bytes aligned, if subtarget doesn't support qpx, then we can't +; place b, c to qpx register, so we can't do sco on caller_v4i1_reorder + +; CHECK-SCO-LABEL: caller_v4i1_reorder: +; CHECK-SCO: bl callee_v4i1 + +; CHECK-SCO-HASQPX-LABEL: caller_v4i1_reorder: +; CHECK-SCO-HASQPX: b callee_v4i1 +} + +define void @f128_callee(i32* %ptr, ppc_fp128 %a, ppc_fp128 %b) { ret void } +define void @f128_caller(i32* %ptr, ppc_fp128 %a, ppc_fp128 %b) { + tail call void @f128_callee(i32* %ptr, ppc_fp128 %a, ppc_fp128 %b) + ret void + +; CHECK-SCO-LABEL: f128_caller: +; CHECK-SCO: b f128_callee +} + +; weak linkage test +%class.T = type { [2 x i8] } + +define weak_odr hidden void @wo_hcallee(%class.T* %this, i8* %c) { ret void } +define void @wo_hcaller(%class.T* %this, i8* %c) { + tail call void @wo_hcallee(%class.T* %this, i8* %c) + ret void + +; CHECK-SCO-LABEL: wo_hcaller: +; CHECK-SCO: b wo_hcallee +} + +define weak_odr protected void @wo_pcallee(%class.T* %this, i8* %c) { ret void } +define void @wo_pcaller(%class.T* %this, i8* %c) { + tail call void @wo_pcallee(%class.T* %this, i8* %c) + ret void + +; CHECK-SCO-LABEL: wo_pcaller: +; CHECK-SCO: b wo_pcallee +} + +define weak_odr void 
@wo_callee(%class.T* %this, i8* %c) { ret void } +define void @wo_caller(%class.T* %this, i8* %c) { + tail call void @wo_callee(%class.T* %this, i8* %c) + ret void + +; CHECK-SCO-LABEL: wo_caller: +; CHECK-SCO: bl wo_callee +} + +define weak protected void @w_pcallee(i8* %ptr) { ret void } +define void @w_pcaller(i8* %ptr) { + tail call void @w_pcallee(i8* %ptr) + ret void + +; CHECK-SCO-LABEL: w_pcaller: +; CHECK-SCO: b w_pcallee +} + +define weak hidden void @w_hcallee(i8* %ptr) { ret void } +define void @w_hcaller(i8* %ptr) { + tail call void @w_hcallee(i8* %ptr) + ret void + +; CHECK-SCO-LABEL: w_hcaller: +; CHECK-SCO: b w_hcallee +} + +define weak void @w_callee(i8* %ptr) { ret void } +define void @w_caller(i8* %ptr) { + tail call void @w_callee(i8* %ptr) + ret void + +; CHECK-SCO-LABEL: w_caller: +; CHECK-SCO: bl w_callee +}