Index: llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp =================================================================== --- llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp +++ llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp @@ -1541,13 +1541,14 @@ // 3) A function does not use the TOC pointer R2 but does have calls. // In this case st_other=1 since we do not know whether or not any // of the callees clobber R2. This case is dealt with in this else if - // block. + // block. Tail calls are considered calls and the st_other should also + // be set to 1 in that case as well. // 4) The function does not use the TOC pointer but R2 is used inside // the function. In this case st_other=1 once again. // 5) This function uses inline asm. We mark R2 as reserved if the function // has inline asm so we have to assume that it may be used. - if (MF->getFrameInfo().hasCalls() || MF->hasInlineAsm() || - (!PPCFI->usesTOCBasePtr() && UsesX2OrR2)) { + if (MF->getFrameInfo().hasCalls() || MF->getFrameInfo().hasTailCall() || + MF->hasInlineAsm() || (!PPCFI->usesTOCBasePtr() && UsesX2OrR2)) { PPCTargetStreamer *TS = static_cast(OutStreamer->getTargetStreamer()); if (TS) Index: llvm/lib/Target/PowerPC/PPCFrameLowering.cpp =================================================================== --- llvm/lib/Target/PowerPC/PPCFrameLowering.cpp +++ llvm/lib/Target/PowerPC/PPCFrameLowering.cpp @@ -1687,13 +1687,25 @@ DebugLoc dl = MBBI->getDebugLoc(); const PPCInstrInfo &TII = *Subtarget.getInstrInfo(); - // Create branch instruction for pseudo tail call return instruction + // Create branch instruction for pseudo tail call return instruction. + // The TCRETURNdi variants are direct calls. Valid targets for those are + // MO_GlobalAddress operands as well as MO_ExternalSymbol with PC-Rel + // since we can tail call external functions with PC-Rel (i.e. don't need to + // worry about different TOC pointers). 
Some of the external functions will + // be MO_GlobalAddress while others (like memcpy for example) are going to + // be MO_ExternalSymbol. unsigned RetOpcode = MBBI->getOpcode(); if (RetOpcode == PPC::TCRETURNdi) { MBBI = MBB.getLastNonDebugInstr(); MachineOperand &JumpTarget = MBBI->getOperand(0); - BuildMI(MBB, MBBI, dl, TII.get(PPC::TAILB)). - addGlobalAddress(JumpTarget.getGlobal(), JumpTarget.getOffset()); + if (JumpTarget.isGlobal()) + BuildMI(MBB, MBBI, dl, TII.get(PPC::TAILB)). + addGlobalAddress(JumpTarget.getGlobal(), JumpTarget.getOffset()); + else if (JumpTarget.isSymbol()) + BuildMI(MBB, MBBI, dl, TII.get(PPC::TAILB)). + addExternalSymbol(JumpTarget.getSymbolName()); + else + llvm_unreachable("Expecting Global or External Symbol"); } else if (RetOpcode == PPC::TCRETURNri) { MBBI = MBB.getLastNonDebugInstr(); assert(MBBI->getOperand(0).isReg() && "Expecting register operand."); @@ -1705,8 +1717,14 @@ } else if (RetOpcode == PPC::TCRETURNdi8) { MBBI = MBB.getLastNonDebugInstr(); MachineOperand &JumpTarget = MBBI->getOperand(0); - BuildMI(MBB, MBBI, dl, TII.get(PPC::TAILB8)). - addGlobalAddress(JumpTarget.getGlobal(), JumpTarget.getOffset()); + if (JumpTarget.isGlobal()) + BuildMI(MBB, MBBI, dl, TII.get(PPC::TAILB8)). + addGlobalAddress(JumpTarget.getGlobal(), JumpTarget.getOffset()); + else if (JumpTarget.isSymbol()) + BuildMI(MBB, MBBI, dl, TII.get(PPC::TAILB8)). 
+ addExternalSymbol(JumpTarget.getSymbolName()); + else + llvm_unreachable("Expecting Global or External Symbol"); } else if (RetOpcode == PPC::TCRETURNri8) { MBBI = MBB.getLastNonDebugInstr(); assert(MBBI->getOperand(0).isReg() && "Expecting register operand."); Index: llvm/lib/Target/PowerPC/PPCISelLowering.cpp =================================================================== --- llvm/lib/Target/PowerPC/PPCISelLowering.cpp +++ llvm/lib/Target/PowerPC/PPCISelLowering.cpp @@ -4685,6 +4685,12 @@ static bool hasSameArgumentList(const Function *CallerFn, ImmutableCallSite CS) { + // If the call site does not have a valid instruction pointer we don't have + // enough information to determine if we have the same argument list. + // We return false just to be safe. + if (!CS.getInstruction()) + return false; + if (CS.arg_size() != CallerFn->arg_size()) return false; @@ -4742,16 +4748,6 @@ SelectionDAG& DAG) const { bool TailCallOpt = getTargetMachine().Options.GuaranteedTailCallOpt; - // FIXME: Tail calls are currently disabled when using PC Relative addressing. - // The issue is that PC Relative is only partially implemented and so there - // is currently a mix of functions that require the TOC and functions that do - // not require it. If we have A calls B calls C and both A and B require the - // TOC and C does not and is marked as clobbering R2 then it is not safe for - // B to tail call C. Since we do not have the information of whether or not - // a funciton needs to use the TOC here in this function we need to be - // conservatively safe and disable all tail calls for now. - if (Subtarget.isUsingPCRelativeCalls()) return false; - if (DisableSCO && !TailCallOpt) return false; // Variadic argument functions are not supported. 
@@ -4791,15 +4787,23 @@ needStackSlotPassParameters(Subtarget, Outs)) return false; - // No TCO/SCO on indirect call because Caller have to restore its TOC - if (!isFunctionGlobalAddress(Callee) && - !isa(Callee)) + // All variants of 64-bit ELF ABIs without PC-Relative addressing + // require that the caller and callee share the same TOC for + // TCO/SCO. If the caller and callee potentially have different TOC bases + // then we cannot tail call since we need to restore the TOC pointer after + // the call. + // ref: https://bugzilla.mozilla.org/show_bug.cgi?id=973977 + // We cannot guarantee this for indirect calls or calls to external + // functions. When PC-Relative addressing is used, the concept of the TOC is + // no longer applicable so this check is not required. + // Check first for indirect calls. + if (!Subtarget.isUsingPCRelativeCalls() && + !isFunctionGlobalAddress(Callee) && !isa(Callee)) return false; - // If the caller and callee potentially have different TOC bases then we - // cannot tail call since we need to restore the TOC pointer after the call. - // ref: https://bugzilla.mozilla.org/show_bug.cgi?id=973977 - if (!callsShareTOCBase(&Caller, Callee, getTargetMachine())) + // Check if we share the TOC base. + if (!Subtarget.isUsingPCRelativeCalls() && + !callsShareTOCBase(&Caller, Callee, getTargetMachine())) return false; // TCO allows altering callee ABI, so we don't have to check further. @@ -5506,13 +5510,17 @@ // Emit tail call. if (CFlags.IsTailCall) { + // Indirect tail calls when using PC Relative calls do not have the same + // constraints. 
assert(((Callee.getOpcode() == ISD::Register && cast(Callee)->getReg() == PPC::CTR) || Callee.getOpcode() == ISD::TargetExternalSymbol || Callee.getOpcode() == ISD::TargetGlobalAddress || - isa(Callee)) && + isa(Callee) || + (CFlags.IsIndirect && Subtarget.isUsingPCRelativeCalls())) && "Expecting a global address, external symbol, absolute value or " "register"); + // PC Relative calls also use TC_RETURN as the way to mark tail calls. assert(CallOpc == PPCISD::TC_RETURN && "Unexpected call opcode for a tail call."); DAG.getMachineFunction().getFrameInfo().setHasTailCall(); @@ -5571,17 +5579,19 @@ if (!getTargetMachine().Options.GuaranteedTailCallOpt) ++NumSiblingCalls; - assert(isa(Callee) && + // PC Relative calls no longer guarantee that the callee is a Global + // Address Node. The callee could be an indirect tail call in which + // case the SDValue for the callee could be a load (to load the address + // of a function pointer) or it may be a register copy (to move the + // address of the callee from a function parameter into a virtual + // register). It may also be an ExternalSymbolSDNode (e.g. memcpy). 
+ assert((Subtarget.isUsingPCRelativeCalls() || + isa(Callee)) && "Callee should be an llvm::Function object."); - LLVM_DEBUG( - const GlobalValue *GV = - cast(Callee)->getGlobal(); - const unsigned Width = - 80 - strlen("TCO caller: ") - strlen(", callee linkage: 0, 0"); - dbgs() << "TCO caller: " - << left_justify(DAG.getMachineFunction().getName(), Width) - << ", callee linkage: " << GV->getVisibility() << ", " - << GV->getLinkage() << "\n"); + + LLVM_DEBUG(dbgs() << "TCO caller: " << DAG.getMachineFunction().getName() + << "\nTCO callee: "); + LLVM_DEBUG(Callee.dump()); } } Index: llvm/lib/Target/PowerPC/PPCMCInstLower.cpp =================================================================== --- llvm/lib/Target/PowerPC/PPCMCInstLower.cpp +++ llvm/lib/Target/PowerPC/PPCMCInstLower.cpp @@ -86,14 +86,22 @@ RefKind = MCSymbolRefExpr::VK_PPC_GOT_PCREL; const MachineInstr *MI = MO.getParent(); - - if (MI->getOpcode() == PPC::BL8_NOTOC) - RefKind = MCSymbolRefExpr::VK_PPC_NOTOC; - const MachineFunction *MF = MI->getMF(); const Module *M = MF->getFunction().getParent(); const PPCSubtarget *Subtarget = &(MF->getSubtarget()); const TargetMachine &TM = Printer.TM; + + unsigned MIOpcode = MI->getOpcode(); + assert((Subtarget->isUsingPCRelativeCalls() || MIOpcode != PPC::BL8_NOTOC) && + "BL8_NOTOC is only valid when using PC Relative Calls."); + if (Subtarget->isUsingPCRelativeCalls()) { + if (MIOpcode == PPC::TAILB || MIOpcode == PPC::TAILB8 || + MIOpcode == PPC::TCRETURNdi || MIOpcode == PPC::TCRETURNdi8 || + MIOpcode == PPC::BL8_NOTOC) { + RefKind = MCSymbolRefExpr::VK_PPC_NOTOC; + } + } + const MCExpr *Expr = MCSymbolRefExpr::create(Symbol, RefKind, Ctx); // If -msecure-plt -fPIC, add 32768 to symbol. 
if (Subtarget->isSecurePlt() && TM.isPositionIndependent() && Index: llvm/test/CodeGen/PowerPC/pcrel-got-indirect.ll =================================================================== --- llvm/test/CodeGen/PowerPC/pcrel-got-indirect.ll +++ llvm/test/CodeGen/PowerPC/pcrel-got-indirect.ll @@ -215,20 +215,13 @@ define dso_local void @ReadFuncPtr() local_unnamed_addr { ; CHECK-LABEL: ReadFuncPtr: -; CHECK: # %bb.0: # %entry -; CHECK-NEXT: mflr r0 -; CHECK-NEXT: std r0, 16(r1) -; CHECK-NEXT: stdu r1, -32(r1) -; CHECK-NEXT: .cfi_def_cfa_offset 32 -; CHECK-NEXT: .cfi_offset lr, 16 +; CHECK: .localentry ReadFuncPtr, 1 +; CHECK-NEXT: # %bb.0: # %entry ; CHECK-NEXT: pld r3, ptrfunc@got@pcrel(0), 1 -; CHECK-NEXT: ld r3, 0(r3) -; CHECK-NEXT: mtctr r3 -; CHECK-NEXT: bctrl -; CHECK-NEXT: addi r1, r1, 32 -; CHECK-NEXT: ld r0, 16(r1) -; CHECK-NEXT: mtlr r0 -; CHECK-NEXT: blr +; CHECK-NEXT: ld r12, 0(r3) +; CHECK-NEXT: mtctr r12 +; CHECK-NEXT: bctr +; CHECK-NEXT: #TC_RETURNr8 ctr 0 entry: %0 = load void ()*, void ()** bitcast (void (...)** @ptrfunc to void ()**), align 8 tail call void %0() Index: llvm/test/CodeGen/PowerPC/pcrel-tail-calls.ll =================================================================== --- /dev/null +++ llvm/test/CodeGen/PowerPC/pcrel-tail-calls.ll @@ -0,0 +1,237 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -verify-machineinstrs -mtriple=powerpc64le-unknown-linux-gnu \ +; RUN: -mcpu=future -ppc-asm-full-reg-names -ppc-vsr-nums-as-vr < %s | \ +; RUN: FileCheck %s + +; The tests check the behaviour of PC Relative tail calls. When using +; PC Relative we are able to do more tail calling than we have done in +; the past as we no longer need to restore the TOC pointer into R2 after +; most calls. + +@Func = external local_unnamed_addr global i32 (...)*, align 8 +@FuncLocal = common dso_local local_unnamed_addr global i32 (...)* null, align 8 + +; No calls in this function but we assign the function pointers. 
+define dso_local void @AssignFuncPtr() local_unnamed_addr { +; CHECK-LABEL: AssignFuncPtr: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: pld r3, Func@got@pcrel(0), 1 +; CHECK-NEXT: pld r4, Function@got@pcrel(0), 1 +; CHECK-NEXT: std r4, 0(r3) +; CHECK-NEXT: pstd r4, FuncLocal@PCREL(0), 1 +; CHECK-NEXT: blr +entry: + store i32 (...)* @Function, i32 (...)** @Func, align 8 + store i32 (...)* @Function, i32 (...)** @FuncLocal, align 8 + ret void +} + +declare signext i32 @Function(...) + +define dso_local void @TailCallLocalFuncPtr() local_unnamed_addr { +; CHECK-LABEL: TailCallLocalFuncPtr: +; CHECK: .localentry TailCallLocalFuncPtr, 1 +; CHECK-NEXT: # %bb.0: # %entry +; CHECK-NEXT: pld r12, FuncLocal@PCREL(0), 1 +; CHECK-NEXT: mtctr r12 +; CHECK-NEXT: bctr +; CHECK-NEXT: #TC_RETURNr8 ctr 0 +entry: + %0 = load i32 ()*, i32 ()** bitcast (i32 (...)** @FuncLocal to i32 ()**), align 8 + %call = tail call signext i32 %0() + ret void +} + +define dso_local void @TailCallExtrnFuncPtr() local_unnamed_addr { +; CHECK-LABEL: TailCallExtrnFuncPtr: +; CHECK: .localentry TailCallExtrnFuncPtr, 1 +; CHECK-NEXT: # %bb.0: # %entry +; CHECK-NEXT: pld r3, Func@got@pcrel(0), 1 +; CHECK-NEXT: ld r12, 0(r3) +; CHECK-NEXT: mtctr r12 +; CHECK-NEXT: bctr +; CHECK-NEXT: #TC_RETURNr8 ctr 0 +entry: + %0 = load i32 ()*, i32 ()** bitcast (i32 (...)** @Func to i32 ()**), align 8 + %call = tail call signext i32 %0() + ret void +} + +define dso_local signext i32 @TailCallParamFuncPtr(i32 (...)* nocapture %passedfunc) local_unnamed_addr { +; CHECK-LABEL: TailCallParamFuncPtr: +; CHECK: .localentry TailCallParamFuncPtr, 1 +; CHECK-NEXT: # %bb.0: # %entry +; CHECK-NEXT: mtctr r3 +; CHECK-NEXT: mr r12, r3 +; CHECK-NEXT: bctr +; CHECK-NEXT: #TC_RETURNr8 ctr 0 +entry: + %callee.knr.cast = bitcast i32 (...)* %passedfunc to i32 ()* + %call = tail call signext i32 %callee.knr.cast() + ret i32 %call +} + +define dso_local signext i32 @NoTailIndirectCall(i32 (...)* nocapture %passedfunc, i32 signext %a) 
local_unnamed_addr { +; CHECK-LABEL: NoTailIndirectCall: +; CHECK: .localentry NoTailIndirectCall, 1 +; CHECK-NEXT: # %bb.0: # %entry +; CHECK-NEXT: mflr r0 +; CHECK-NEXT: .cfi_def_cfa_offset 48 +; CHECK-NEXT: .cfi_offset lr, 16 +; CHECK-NEXT: .cfi_offset r30, -16 +; CHECK-NEXT: std r30, -16(r1) # 8-byte Folded Spill +; CHECK-NEXT: std r0, 16(r1) +; CHECK-NEXT: stdu r1, -48(r1) +; CHECK-NEXT: mtctr r3 +; CHECK-NEXT: mr r12, r3 +; CHECK-NEXT: mr r30, r4 +; CHECK-NEXT: bctrl +; CHECK-NEXT: add r3, r3, r30 +; CHECK-NEXT: extsw r3, r3 +; CHECK-NEXT: addi r1, r1, 48 +; CHECK-NEXT: ld r0, 16(r1) +; CHECK-NEXT: ld r30, -16(r1) # 8-byte Folded Reload +; CHECK-NEXT: mtlr r0 +; CHECK-NEXT: blr +entry: + %callee.knr.cast = bitcast i32 (...)* %passedfunc to i32 ()* + %call = tail call signext i32 %callee.knr.cast() + %add = add nsw i32 %call, %a + ret i32 %add +} + +define dso_local signext i32 @TailCallDirect() local_unnamed_addr { +; CHECK-LABEL: TailCallDirect: +; CHECK: .localentry TailCallDirect, 1 +; CHECK-NEXT: # %bb.0: # %entry +; CHECK-NEXT: b Function@notoc +; CHECK-NEXT: #TC_RETURNd8 Function@notoc 0 +entry: + %call = tail call signext i32 bitcast (i32 (...)* @Function to i32 ()*)() + ret i32 %call +} + +define dso_local signext i32 @NoTailCallDirect(i32 signext %a) local_unnamed_addr { +; CHECK-LABEL: NoTailCallDirect: +; CHECK: .localentry NoTailCallDirect, 1 +; CHECK-NEXT: # %bb.0: # %entry +; CHECK-NEXT: mflr r0 +; CHECK-NEXT: .cfi_def_cfa_offset 48 +; CHECK-NEXT: .cfi_offset lr, 16 +; CHECK-NEXT: .cfi_offset r30, -16 +; CHECK-NEXT: std r30, -16(r1) # 8-byte Folded Spill +; CHECK-NEXT: std r0, 16(r1) +; CHECK-NEXT: stdu r1, -48(r1) +; CHECK-NEXT: mr r30, r3 +; CHECK-NEXT: bl Function@notoc +; CHECK-NEXT: add r3, r3, r30 +; CHECK-NEXT: extsw r3, r3 +; CHECK-NEXT: addi r1, r1, 48 +; CHECK-NEXT: ld r0, 16(r1) +; CHECK-NEXT: ld r30, -16(r1) # 8-byte Folded Reload +; CHECK-NEXT: mtlr r0 +; CHECK-NEXT: blr +entry: + %call = tail call signext i32 bitcast (i32 (...)* 
@Function to i32 ()*)() + %add = add nsw i32 %call, %a + ret i32 %add +} + +define dso_local signext i32 @TailCallDirectLocal() local_unnamed_addr { +; CHECK-LABEL: TailCallDirectLocal: +; CHECK: .localentry TailCallDirectLocal, 1 +; CHECK-NEXT: # %bb.0: # %entry +; CHECK-NEXT: b LocalFunction@notoc +; CHECK-NEXT: #TC_RETURNd8 LocalFunction@notoc 0 +entry: + %call = tail call fastcc signext i32 @LocalFunction() + ret i32 %call +} + +define dso_local signext i32 @NoTailCallDirectLocal(i32 signext %a) local_unnamed_addr { +; CHECK-LABEL: NoTailCallDirectLocal: +; CHECK: .localentry NoTailCallDirectLocal, 1 +; CHECK-NEXT: # %bb.0: # %entry +; CHECK-NEXT: mflr r0 +; CHECK-NEXT: .cfi_def_cfa_offset 48 +; CHECK-NEXT: .cfi_offset lr, 16 +; CHECK-NEXT: .cfi_offset r30, -16 +; CHECK-NEXT: std r30, -16(r1) # 8-byte Folded Spill +; CHECK-NEXT: std r0, 16(r1) +; CHECK-NEXT: stdu r1, -48(r1) +; CHECK-NEXT: mr r30, r3 +; CHECK-NEXT: bl LocalFunction@notoc +; CHECK-NEXT: add r3, r3, r30 +; CHECK-NEXT: extsw r3, r3 +; CHECK-NEXT: addi r1, r1, 48 +; CHECK-NEXT: ld r0, 16(r1) +; CHECK-NEXT: ld r30, -16(r1) # 8-byte Folded Reload +; CHECK-NEXT: mtlr r0 +; CHECK-NEXT: blr +entry: + %call = tail call fastcc signext i32 @LocalFunction() + %add = add nsw i32 %call, %a + ret i32 %add +} + +define dso_local signext i32 @TailCallAbs() local_unnamed_addr { +; CHECK-LABEL: TailCallAbs: +; CHECK: .localentry TailCallAbs, 1 +; CHECK-NEXT: # %bb.0: # %entry +; CHECK-NEXT: li r3, 400 +; CHECK-NEXT: mtctr r3 +; CHECK-NEXT: li r12, 400 +; CHECK-NEXT: bctr +; CHECK-NEXT: #TC_RETURNr8 ctr 0 +entry: + %call = tail call signext i32 inttoptr (i64 400 to i32 ()*)() + ret i32 %call +} + +define dso_local signext i32 @NoTailCallAbs(i32 signext %a) local_unnamed_addr { +; CHECK-LABEL: NoTailCallAbs: +; CHECK: .localentry NoTailCallAbs, 1 +; CHECK-NEXT: # %bb.0: # %entry +; CHECK-NEXT: mflr r0 +; CHECK-NEXT: .cfi_def_cfa_offset 48 +; CHECK-NEXT: .cfi_offset lr, 16 +; CHECK-NEXT: .cfi_offset r30, -16 +; 
CHECK-NEXT: std r30, -16(r1) # 8-byte Folded Spill +; CHECK-NEXT: std r0, 16(r1) +; CHECK-NEXT: stdu r1, -48(r1) +; CHECK-NEXT: mr r30, r3 +; CHECK-NEXT: li r3, 400 +; CHECK-NEXT: mtctr r3 +; CHECK-NEXT: li r12, 400 +; CHECK-NEXT: bctrl +; CHECK-NEXT: add r3, r3, r30 +; CHECK-NEXT: extsw r3, r3 +; CHECK-NEXT: addi r1, r1, 48 +; CHECK-NEXT: ld r0, 16(r1) +; CHECK-NEXT: ld r30, -16(r1) # 8-byte Folded Reload +; CHECK-NEXT: mtlr r0 +; CHECK-NEXT: blr +entry: + %call = tail call signext i32 inttoptr (i64 400 to i32 ()*)() + %add = add nsw i32 %call, %a + ret i32 %add +} + +; Function Attrs: noinline +; This function should be tail called and not inlined. +define internal fastcc signext i32 @LocalFunction() unnamed_addr #0 { +; CHECK-LABEL: LocalFunction: +; CHECK: .localentry LocalFunction, 1 +; CHECK-NEXT: # %bb.0: # %entry +; CHECK-NEXT: #APP +; CHECK-NEXT: li r3, 42 +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: extsw r3, r3 +; CHECK-NEXT: blr +entry: + %0 = tail call i32 asm "li $0, 42", "=&r"() + ret i32 %0 +} + +attributes #0 = { noinline } +