Index: lib/Target/PowerPC/PPCISelLowering.h
===================================================================
--- lib/Target/PowerPC/PPCISelLowering.h
+++ lib/Target/PowerPC/PPCISelLowering.h
@@ -713,6 +713,16 @@
                     const SmallVectorImpl<ISD::InputArg> &Ins,
                     SelectionDAG& DAG) const;
 
+    bool
+    IsEligibleForTailCallOptimization_64SVR4(
+                    SDValue Callee,
+                    CallingConv::ID CalleeCC,
+                    ImmutableCallSite *CS,
+                    bool isVarArg,
+                    const SmallVectorImpl<ISD::OutputArg> &Outs,
+                    const SmallVectorImpl<ISD::InputArg> &Ins,
+                    SelectionDAG& DAG) const;
+
     SDValue EmitTailCallLoadFPAndRetAddr(SelectionDAG & DAG,
                          int SPDiff,
                          SDValue Chain,
Index: lib/Target/PowerPC/PPCISelLowering.cpp
===================================================================
--- lib/Target/PowerPC/PPCISelLowering.cpp
+++ lib/Target/PowerPC/PPCISelLowering.cpp
@@ -19,6 +19,7 @@
 #include "PPCTargetMachine.h"
 #include "PPCTargetObjectFile.h"
 #include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/Statistic.h"
 #include "llvm/ADT/StringSwitch.h"
 #include "llvm/ADT/Triple.h"
 #include "llvm/CodeGen/CallingConvLower.h"
@@ -36,12 +37,15 @@
 #include "llvm/IR/Intrinsics.h"
 #include "llvm/Support/CommandLine.h"
 #include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/Format.h"
 #include "llvm/Support/MathExtras.h"
 #include "llvm/Support/raw_ostream.h"
 #include "llvm/Target/TargetOptions.h"
 
 using namespace llvm;
 
+#define DEBUG_TYPE "ppc-lowering"
+
 static cl::opt<bool> DisablePPCPreinc("disable-ppc-preinc",
 cl::desc("disable preincrement load/store generation on PPC"), cl::Hidden);
 
@@ -51,6 +55,12 @@
 static cl::opt<bool> DisablePPCUnaligned("disable-ppc-unaligned",
 cl::desc("disable unaligned load/store generation on PPC"), cl::Hidden);
 
+static cl::opt<bool> DisableSCO("disable-ppc-sco",
+cl::desc("disable sibling call optimization on ppc"), cl::Hidden);
+
+STATISTIC(NumTailCalls, "Number of tail calls");
+STATISTIC(NumSiblingCalls, "Number of sibling calls");
+
 // FIXME: Remove this once the bug has been fixed!
 extern cl::opt<bool> ANDIGlueBug;
 
@@ -3842,6 +3852,176 @@
   return SPDiff;
 }
 
+static bool isFunctionGlobalAddress(SDValue Callee);
+
+static bool
+resideInSameModule(SDValue Callee, Reloc::Model RelMod) {
+  // If !G, Callee can be an external symbol.
+  GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee);
+  if (!G) return false;
+
+  const GlobalValue *GV = G->getGlobal();
+
+  if (GV->isDeclaration()) return false;
+
+  switch(GV->getLinkage()) {
+  default: llvm_unreachable("unknown linkage type");
+  case GlobalValue::AvailableExternallyLinkage:
+  case GlobalValue::ExternalWeakLinkage:
+    return false;
+
+  // Callee with weak linkage is allowed if it has hidden or protected
+  // visibility
+  case GlobalValue::LinkOnceAnyLinkage:
+  case GlobalValue::LinkOnceODRLinkage: // e.g. c++ inline functions
+  case GlobalValue::WeakAnyLinkage:
+  case GlobalValue::WeakODRLinkage:     // e.g. c++ template instantiation
+    if (GV->hasDefaultVisibility())
+      return false;
+
+  case GlobalValue::ExternalLinkage:
+  case GlobalValue::InternalLinkage:
+  case GlobalValue::PrivateLinkage:
+    break;
+  }
+
+  // With '-fPIC', calling a default-visibility function requires a 'nop'
+  // after the call no matter whether that function resides in the same
+  // module or not, so we treat it as being in a different module.
+  if (RelMod == Reloc::PIC_ && GV->hasDefaultVisibility())
+    return false;
+
+  return true;
+}
+
+static bool
+needStackSlotPassParameters(const PPCSubtarget &Subtarget,
+                            const SmallVectorImpl<ISD::OutputArg> &Outs) {
+  assert(Subtarget.isSVR4ABI() && Subtarget.isPPC64());
+
+  const unsigned PtrByteSize = 8;
+  const unsigned LinkageSize = Subtarget.getFrameLowering()->getLinkageSize();
+
+  static const MCPhysReg GPR[] = {
+    PPC::X3, PPC::X4, PPC::X5, PPC::X6,
+    PPC::X7, PPC::X8, PPC::X9, PPC::X10,
+  };
+  static const MCPhysReg VR[] = {
+    PPC::V2, PPC::V3, PPC::V4, PPC::V5, PPC::V6, PPC::V7, PPC::V8,
+    PPC::V9, PPC::V10, PPC::V11, PPC::V12, PPC::V13
+  };
+
+  const unsigned NumGPRs = array_lengthof(GPR);
+  const unsigned NumFPRs = 13;
+  const unsigned NumVRs = array_lengthof(VR);
+  const unsigned ParamAreaSize = NumGPRs * PtrByteSize;
+
+  unsigned NumBytes = LinkageSize;
+  unsigned AvailableFPRs = NumFPRs;
+  unsigned AvailableVRs = NumVRs;
+
+  for (const ISD::OutputArg& Param : Outs) {
+    if (Param.Flags.isNest()) continue;
+
+    if (CalculateStackSlotUsed(Param.VT, Param.ArgVT, Param.Flags,
+                               PtrByteSize, LinkageSize, ParamAreaSize,
+                               NumBytes, AvailableFPRs, AvailableVRs,
+                               Subtarget.hasQPX()))
+      return true;
+  }
+  return false;
+}
+
+static bool
+hasSameArgumentList(const Function *CallerFn, ImmutableCallSite *CS) {
+  if (CS->arg_size() != CallerFn->getArgumentList().size())
+    return false;
+
+  ImmutableCallSite::arg_iterator CalleeArgIter = CS->arg_begin();
+  ImmutableCallSite::arg_iterator CalleeArgEnd = CS->arg_end();
+  Function::const_arg_iterator CallerArgIter = CallerFn->arg_begin();
+
+  for (; CalleeArgIter != CalleeArgEnd; ++CalleeArgIter, ++CallerArgIter) {
+    const Value* CalleeArg = *CalleeArgIter;
+    const Value* CallerArg = &(*CallerArgIter);
+    if (CalleeArg == CallerArg)
+      continue;
+
+    // e.g. @caller([4 x i64] %a, [4 x i64] %b) {
+    //        tail call @callee([4 x i64] undef, [4 x i64] %b)
+    //      }
+    // 1st argument of callee is undef and has the same type as caller.
+    if (CalleeArg->getType() == CallerArg->getType() &&
+        isa<UndefValue>(CalleeArg))
+      continue;
+
+    return false;
+  }
+
+  return true;
+}
+
+bool
+PPCTargetLowering::IsEligibleForTailCallOptimization_64SVR4(
+                                    SDValue Callee,
+                                    CallingConv::ID CalleeCC,
+                                    ImmutableCallSite *CS,
+                                    bool isVarArg,
+                                    const SmallVectorImpl<ISD::OutputArg> &Outs,
+                                    const SmallVectorImpl<ISD::InputArg> &Ins,
+                                    SelectionDAG& DAG) const {
+  bool TailCallOpt = getTargetMachine().Options.GuaranteedTailCallOpt;
+
+  if (DisableSCO && !TailCallOpt) return false;
+
+  // Variadic argument functions are not supported.
+  if (isVarArg) return false;
+
+  MachineFunction &MF = DAG.getMachineFunction();
+  CallingConv::ID CallerCC = MF.getFunction()->getCallingConv();
+
+  // Tail call or sibling call optimization (TCO/SCO) needs the callee and
+  // the caller to have the same calling convention.
+  if (CallerCC != CalleeCC) return false;
+
+  // SCO supports the C calling convention.
+  if (CalleeCC != CallingConv::Fast && CalleeCC != CallingConv::C)
+    return false;
+
+  // Functions containing by val parameters are not supported.
+  if (std::any_of(Ins.begin(), Ins.end(),
+                  [](const ISD::InputArg& IA) { return IA.Flags.isByVal(); }))
+    return false;
+
+  // No TCO/SCO on indirect call because the caller has to restore its TOC.
+  if (!isFunctionGlobalAddress(Callee) &&
+      !isa<ExternalSymbolSDNode>(Callee))
+    return false;
+
+  // Check if Callee resides in the same module, because for now, PPC64 SVR4 ABI
+  // (ELFv1/ELFv2) doesn't allow tail calls to a symbol that resides in another
+  // module.
+  // ref: https://bugzilla.mozilla.org/show_bug.cgi?id=973977
+  if (!resideInSameModule(Callee, getTargetMachine().getRelocationModel()))
+    return false;
+
+  // TCO allows altering callee ABI, so we don't have to check further.
+  if (CalleeCC == CallingConv::Fast && TailCallOpt)
+    return true;
+
+  if (DisableSCO) return false;
+
+  // If the callee uses the same argument list that the caller is using, then
+  // we can apply SCO in this case. If it does not, then we need to check if
+  // the callee needs stack for passing arguments.
+  if (!hasSameArgumentList(MF.getFunction(), CS) &&
+      needStackSlotPassParameters(Subtarget, Outs)) {
+    return false;
+  }
+
+  return true;
+}
+
 /// IsEligibleForTailCallOptimization - Check whether the call is eligible
 /// for tail call optimization. Targets which want to do tail call
 /// optimization should implement this function.
@@ -4479,9 +4659,32 @@
   bool IsPatchPoint = CLI.IsPatchPoint;
   ImmutableCallSite *CS = CLI.CS;
 
-  if (isTailCall)
-    isTailCall = IsEligibleForTailCallOptimization(Callee, CallConv, isVarArg,
-                                                   Ins, DAG);
+  if (isTailCall) {
+    if (Subtarget.isSVR4ABI() && Subtarget.isPPC64())
+      isTailCall =
+        IsEligibleForTailCallOptimization_64SVR4(Callee, CallConv, CS,
+                                                 isVarArg, Outs, Ins, DAG);
+    else
+      isTailCall = IsEligibleForTailCallOptimization(Callee, CallConv, isVarArg,
+                                                     Ins, DAG);
+    if (isTailCall) {
+      ++NumTailCalls;
+      if (!getTargetMachine().Options.GuaranteedTailCallOpt)
+        ++NumSiblingCalls;
+
+      assert(isa<GlobalAddressSDNode>(Callee) &&
+             "Callee should be an llvm::Function object.");
+      DEBUG(
+        const GlobalValue *GV = cast<GlobalAddressSDNode>(Callee)->getGlobal();
+        const unsigned Width = 80 - strlen("TCO caller: ")
+                                  - strlen(", callee linkage: 0, 0");
+        dbgs() << "TCO caller: "
+               << left_justify(DAG.getMachineFunction().getName(), Width)
+               << ", callee linkage: "
+               << GV->getVisibility() << ", " << GV->getLinkage() << "\n"
+      );
+    }
+  }
 
   if (!isTailCall && CS && CS->isMustTailCall())
     report_fatal_error("failed to perform tail call elimination on a call "
@@ -4760,12 +4963,16 @@
   bool isLittleEndian = Subtarget.isLittleEndian();
   unsigned NumOps = Outs.size();
   bool hasNest = false;
+  bool IsSibCall = false;
 
   EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy(DAG.getDataLayout());
   unsigned PtrByteSize = 8;
 
   MachineFunction &MF = DAG.getMachineFunction();
 
+  if (isTailCall && !getTargetMachine().Options.GuaranteedTailCallOpt)
+    IsSibCall = true;
+
   // Mark this function as potentially containing a function that contains a
   // tail call. As a consequence the frame pointer will be used for dynamicalloc
   // and restoring the callers stack pointer in this functions epilog. This is
@@ -4885,9 +5092,12 @@
       CallConv == CallingConv::Fast)
     NumBytes = EnsureStackAlignment(Subtarget.getFrameLowering(), NumBytes);
 
+  int SPDiff = 0;
+
   // Calculate by how many bytes the stack has to be adjusted in case of tail
   // call optimization.
-  int SPDiff = CalculateTailCallSPDiff(DAG, isTailCall, NumBytes);
+  if (!IsSibCall)
+    SPDiff = CalculateTailCallSPDiff(DAG, isTailCall, NumBytes);
 
   // To protect arguments on the stack from being clobbered in a tail call,
   // force all the loads to happen before doing any other lowering.
@@ -4896,8 +5106,9 @@
 
   // Adjust the stack pointer for the new arguments...
   // These operations are automatically eliminated by the prolog/epilog pass
-  Chain = DAG.getCALLSEQ_START(Chain, DAG.getIntPtrConstant(NumBytes, dl, true),
-                               dl);
+  if (!IsSibCall)
+    Chain = DAG.getCALLSEQ_START(Chain,
+                                 DAG.getIntPtrConstant(NumBytes, dl, true), dl);
   SDValue CallSeqStart = Chain;
 
   // Load the return address and frame pointer so it can be move somewhere else
@@ -5366,7 +5577,7 @@
     InFlag = Chain.getValue(1);
   }
 
-  if (isTailCall)
+  if (isTailCall && !IsSibCall)
     PrepareTailCall(DAG, InFlag, Chain, dl, true, SPDiff, NumBytes, LROp,
                     FPOp, true, TailCallArguments);
 
Index: test/CodeGen/PowerPC/ppc64-calls.ll
===================================================================
--- test/CodeGen/PowerPC/ppc64-calls.ll
+++ test/CodeGen/PowerPC/ppc64-calls.ll
@@ -14,7 +14,8 @@
 define void @test_direct() nounwind readnone {
 ; CHECK-LABEL: test_direct:
   tail call void @foo() nounwind
-; CHECK: bl foo
+; Because of tail call optimization, it can be a 'b' instruction.
+; CHECK: [[BR:b[l]?]] foo ; CHECK-NOT: nop ret void } Index: test/CodeGen/PowerPC/ppc64-sibcall-shrinkwrap.ll =================================================================== --- /dev/null +++ test/CodeGen/PowerPC/ppc64-sibcall-shrinkwrap.ll @@ -0,0 +1,46 @@ +; RUN: llc < %s -mtriple=powerpc64-unknown-linux-gnu --enable-shrink-wrap=false | FileCheck %s -check-prefix=CHECK-SCO-ONLY +; RUN: llc < %s -mtriple=powerpc64-unknown-linux-gnu --enable-shrink-wrap=true | FileCheck %s -check-prefix=CHECK-SCO-SHRK +; RUN: llc < %s -mtriple=powerpc64le-unknown-linux-gnu --enable-shrink-wrap=false | FileCheck %s -check-prefix=CHECK-SCO-ONLY +; RUN: llc < %s -mtriple=powerpc64le-unknown-linux-gnu --enable-shrink-wrap=true | FileCheck %s -check-prefix=CHECK-SCO-SHRK + +%"class.clang::NamedDecl" = type { i32 } +declare void @__assert_fail(); + +define i8 @_ZNK5clang9NamedDecl23getLinkageAndVisibilityEv( + %"class.clang::NamedDecl"* %this) { +entry: + %tobool = icmp eq %"class.clang::NamedDecl"* %this, null + br i1 %tobool, label %cond.false, label %exit + +cond.false: + tail call void @__assert_fail() + unreachable + +exit: + %DeclKind = getelementptr inbounds + %"class.clang::NamedDecl", + %"class.clang::NamedDecl"* %this, i64 0, i32 0 + %bf.load = load i32, i32* %DeclKind, align 4 + %call.i = tail call i8 @LVComputationKind( + %"class.clang::NamedDecl"* %this, + i32 %bf.load) + ret i8 %call.i + +; CHECK-SCO-SHRK-LABEL: _ZNK5clang9NamedDecl23getLinkageAndVisibilityEv: +; CHECK-SCO-SHRK: b LVComputationKind +; CHECK-SCO-SHRK: #TC_RETURNd8 +; CHECK-SCO-SHRK: stdu 1, -{{[0-9]+}}(1) +; CHECK-SCO-SHRK: bl __assert_fail +; +; CHECK-SCO-ONLY-LABEL: _ZNK5clang9NamedDecl23getLinkageAndVisibilityEv: +; CHECK-SCO-ONLY: stdu 1, -{{[0-9]+}}(1) +; CHECK-SCO-ONLY: b LVComputationKind +; CHECK-SCO-ONLY: #TC_RETURNd8 +; CHECK-SCO-ONLY: bl __assert_fail +} + +define fastcc i8 @LVComputationKind( + %"class.clang::NamedDecl"* %D, + i32 %computation) { + ret i8 0 +} Index: 
test/CodeGen/PowerPC/ppc64-sibcall.ll =================================================================== --- /dev/null +++ test/CodeGen/PowerPC/ppc64-sibcall.ll @@ -0,0 +1,191 @@ +; RUN: llc < %s -O1 -verify-machineinstrs -mtriple=powerpc64-unknown-linux-gnu | FileCheck %s -check-prefix=CHECK-SCO +; RUN: llc < %s -O1 -verify-machineinstrs -mtriple=powerpc64-unknown-linux-gnu -mcpu=pwr8 | FileCheck %s -check-prefix=CHECK-SCO-HASQPX +; RUN: llc < %s -O1 -verify-machineinstrs -mtriple=powerpc64le-unknown-linux-gnu -mcpu=pwr8 | FileCheck %s -check-prefix=CHECK-SCO-HASQPX + +; No combination of "powerpc64le-unknown-linux-gnu" + "CHECK-SCO", because +; only Power8 (and later) fully support LE. + +%S_56 = type { [13 x i32], i32 } +%S_64 = type { [15 x i32], i32 } +%S_32 = type { [7 x i32], i32 } + +; Function Attrs: noinline nounwind +define void @callee_56_copy([7 x i64] %a, %S_56* %b) #0 { ret void } +define void @callee_64_copy([8 x i64] %a, %S_64* %b) #0 { ret void } + +; Function Attrs: nounwind +define void @caller_56_reorder_copy(%S_56* %b, [7 x i64] %a) #1 { + tail call void @callee_56_copy([7 x i64] %a, %S_56* %b) + ret void + +; CHECK-SCO-LABEL: caller_56_reorder_copy: +; CHECK-SCO-NOT: stdu 1 +; CHECK-SCO: TC_RETURNd8 callee_56_copy +} + +define void @caller_64_reorder_copy(%S_64* %b, [8 x i64] %a) #1 { + tail call void @callee_64_copy([8 x i64] %a, %S_64* %b) + ret void + +; CHECK-SCO-LABEL: caller_64_reorder_copy: +; CHECK-SCO: bl callee_64_copy +} + +define void @callee_64_64_copy([8 x i64] %a, [8 x i64] %b) #0 { ret void } +define void @caller_64_64_copy([8 x i64] %a, [8 x i64] %b) #1 { + tail call void @callee_64_64_copy([8 x i64] %a, [8 x i64] %b) + ret void + +; CHECK-SCO-LABEL: caller_64_64_copy: +; CHECK-SCO: b callee_64_64_copy +} + +define void @caller_64_64_reorder_copy([8 x i64] %a, [8 x i64] %b) #1 { + tail call void @callee_64_64_copy([8 x i64] %b, [8 x i64] %a) + ret void + +; CHECK-SCO-LABEL: caller_64_64_reorder_copy: +; CHECK-SCO: bl 
callee_64_64_copy +} + +define void @caller_64_64_undef_copy([8 x i64] %a, [8 x i64] %b) #1 { + tail call void @callee_64_64_copy([8 x i64] %a, [8 x i64] undef) + ret void + +; CHECK-SCO-LABEL: caller_64_64_undef_copy: +; CHECK-SCO: b callee_64_64_copy +} + +define void @arg8_callee( + float %a, i32 signext %b, float %c, i32* %d, + i8 zeroext %e, float %f, i32* %g, i32 signext %h) +{ + ret void +} + +define void @arg8_caller(float %a, i32 signext %b, i8 zeroext %c, i32* %d) { +entry: + tail call void @arg8_callee(float undef, i32 signext undef, float undef, + i32* %d, i8 zeroext undef, float undef, + i32* undef, i32 signext undef) + ret void + +; CHECK-SCO-LABEL: arg8_caller: +; CHECK-SCO: b arg8_callee +} + +; Struct return test + +; Function Attrs: noinline nounwind +define void @callee_sret_56(%S_56* noalias sret %agg.result) #0 { ret void } +define void @callee_sret_32(%S_32* noalias sret %agg.result) #0 { ret void } + +; Function Attrs: nounwind +define void @caller_do_something_sret_32(%S_32* noalias sret %agg.result) #1 { + %1 = alloca %S_56, align 4 + %2 = bitcast %S_56* %1 to i8* + call void @callee_sret_56(%S_56* nonnull sret %1) + tail call void @callee_sret_32(%S_32* sret %agg.result) + ret void + +; CHECK-SCO-LABEL: caller_do_something_sret_32: +; CHECK-SCO: stdu 1 +; CHECK-SCO: bl callee_sret_56 +; CHECK-SCO: addi 1 +; CHECK-SCO: TC_RETURNd8 callee_sret_32 +} + +define void @caller_local_sret_32(%S_32* %a) #1 { + %tmp = alloca %S_32, align 4 + tail call void @callee_sret_32(%S_32* nonnull sret %tmp) + ret void + +; CHECK-SCO-LABEL: caller_local_sret_32: +; CHECK-SCO: bl callee_sret_32 +} + +attributes #0 = { noinline nounwind } +attributes #1 = { nounwind } + +; vector <4 x i1> test + +define void @callee_v4i1(i8 %a, <4 x i1> %b, <4 x i1> %c) { ret void } +define void @caller_v4i1_reorder(i8 %a, <4 x i1> %b, <4 x i1> %c) { + tail call void @callee_v4i1(i8 %a, <4 x i1> %c, <4 x i1> %b) + ret void + +; <4 x i1> is 32 bytes aligned, if subtarget doesn't 
support qpx, then we can't +; place b, c to qpx register, so we can't do sco on caller_v4i1_reorder + +; CHECK-SCO-LABEL: caller_v4i1_reorder: +; CHECK-SCO: bl callee_v4i1 + +; CHECK-SCO-HASQPX-LABEL: caller_v4i1_reorder: +; CHECK-SCO-HASQPX: b callee_v4i1 +} + +define void @f128_callee(i32* %ptr, ppc_fp128 %a, ppc_fp128 %b) { ret void } +define void @f128_caller(i32* %ptr, ppc_fp128 %a, ppc_fp128 %b) { + tail call void @f128_callee(i32* %ptr, ppc_fp128 %a, ppc_fp128 %b) + ret void + +; CHECK-SCO-LABEL: f128_caller: +; CHECK-SCO: b f128_callee +} + +; weak linkage test +%class.T = type { [2 x i8] } + +define weak_odr hidden void @wo_hcallee(%class.T* %this, i8* %c) { ret void } +define void @wo_hcaller(%class.T* %this, i8* %c) { + tail call void @wo_hcallee(%class.T* %this, i8* %c) + ret void + +; CHECK-SCO-LABEL: wo_hcaller: +; CHECK-SCO: b wo_hcallee +} + +define weak_odr protected void @wo_pcallee(%class.T* %this, i8* %c) { ret void } +define void @wo_pcaller(%class.T* %this, i8* %c) { + tail call void @wo_pcallee(%class.T* %this, i8* %c) + ret void + +; CHECK-SCO-LABEL: wo_pcaller: +; CHECK-SCO: b wo_pcallee +} + +define weak_odr void @wo_callee(%class.T* %this, i8* %c) { ret void } +define void @wo_caller(%class.T* %this, i8* %c) { + tail call void @wo_callee(%class.T* %this, i8* %c) + ret void + +; CHECK-SCO-LABEL: wo_caller: +; CHECK-SCO: bl wo_callee +} + +define weak protected void @w_pcallee(i8* %ptr) { ret void } +define void @w_pcaller(i8* %ptr) { + tail call void @w_pcallee(i8* %ptr) + ret void + +; CHECK-SCO-LABEL: w_pcaller: +; CHECK-SCO: b w_pcallee +} + +define weak hidden void @w_hcallee(i8* %ptr) { ret void } +define void @w_hcaller(i8* %ptr) { + tail call void @w_hcallee(i8* %ptr) + ret void + +; CHECK-SCO-LABEL: w_hcaller: +; CHECK-SCO: b w_hcallee +} + +define weak void @w_callee(i8* %ptr) { ret void } +define void @w_caller(i8* %ptr) { + tail call void @w_callee(i8* %ptr) + ret void + +; CHECK-SCO-LABEL: w_caller: +; CHECK-SCO: bl w_callee +}