Index: include/llvm/Target/TargetInstrInfo.h =================================================================== --- include/llvm/Target/TargetInstrInfo.h +++ include/llvm/Target/TargetInstrInfo.h @@ -155,6 +155,15 @@ unsigned getCatchReturnOpcode() const { return CatchRetOpcode; } unsigned getReturnOpcode() const { return ReturnOpcode; } + /// In the call sequence of A->B->C for example, we can help Shrink Wrapping + /// by allowing directly return from C to A if possible (e.g. tail call in B). + /// This method returns the opcode to call C from B if it is supported. + + /// If direct return is not supported on the target, it returns -1. + virtual unsigned getCallOpcodeForDirectReturn() const { return (unsigned)-1; } + /// Return true if this call instruction is eligible for direct return. + virtual bool isEligibleForDirectReturn(const MachineInstr &CI) const { return false; } + /// Returns the actual stack pointer adjustment made by an instruction /// as part of a call sequence. By default, only call frame setup/destroy /// instructions adjust the stack, but targets may want to override this Index: lib/CodeGen/ShrinkWrap.cpp =================================================================== --- lib/CodeGen/ShrinkWrap.cpp +++ lib/CodeGen/ShrinkWrap.cpp @@ -83,11 +83,17 @@ STATISTIC(NumCandidates, "Number of shrink-wrapping candidates"); STATISTIC(NumCandidatesDropped, "Number of shrink-wrapping candidates dropped because of frequency"); +STATISTIC(NumDirectReturn, + "Number of method calls applied direct return optimization"); static cl::opt EnableShrinkWrapOpt("enable-shrink-wrap", cl::Hidden, cl::desc("enable the shrink-wrapping pass")); +static cl::opt + EnableDirectReturnOpt("shrink-wrap-direct-return-opt", cl::Hidden, cl::init(true), + cl::desc("enable the direct return in shrink wrapping")); + namespace { /// \brief Class to determine where the safe point to insert the /// prologue and epilogue are. @@ -128,12 +134,26 @@ mutable SetOfRegs CurrentCSRs; /// Current MachineFunction. MachineFunction *MachineFunc; + /// Call instructions that can directly return from the callee of + /// this method to the caller of this method (by skinpping this method). + SmallVector DirectReturnCandidates; /// \brief Check if \p MI uses or defines a callee-saved register or /// a frame index. If this is the case, this means \p MI must happen /// after Save and before Restore. bool useOrDefCSROrFI(const MachineInstr &MI, RegScavenger *RS) const; + /// This function iterate over the basic blocks to find the candidates of + /// the direct return optimization. + /// For example, in the call sequence of A->B->C (B is the current method), + /// the direct return allows C to directly pass the control to A + /// when C returns. + /// To apply direct return, the returned value from C directly is passed + /// to A as the return value of B (i.e. no inst. between call and return). + /// Also, call of C must be with the internal linkage since direct return + /// does not comply with ABI. + void findDirectReturnCandidates(const TargetInstrInfo &TII, RegScavenger *RS); + const SetOfRegs &getCurrentCSRs(RegScavenger *RS) const { if (CurrentCSRs.empty()) { BitVector SavedRegs; @@ -171,6 +191,7 @@ FrameDestroyOpcode = TII.getCallFrameDestroyOpcode(); Entry = &MF.front(); CurrentCSRs.clear(); + DirectReturnCandidates.clear(); MachineFunc = &MF; ++NumFunc; @@ -220,6 +241,14 @@ bool ShrinkWrap::useOrDefCSROrFI(const MachineInstr &MI, RegScavenger *RS) const { + + // If DirectReturnCandidates is in the BB, + // Call frame creation can be skipped. + // Use or def of CSR has been already checked. + for (MachineInstr *CI : DirectReturnCandidates) + if (MI.getParent() == CI->getParent()) + return false; + if (MI.getOpcode() == FrameSetupOpcode || MI.getOpcode() == FrameDestroyOpcode) { DEBUG(dbgs() << "Frame instruction: " << MI << '\n'); @@ -418,6 +447,65 @@ return false; } +void +ShrinkWrap::findDirectReturnCandidates(const TargetInstrInfo &TII, RegScavenger *RS) { + for (MachineBasicBlock &MBB : *MachineFunc) { + DEBUG(dbgs() << "findDirectReturnCandidates: " << MBB.getNumber() << ' ' + << MBB.getName() << '\n'); + + // This BB is not a target if it has multiple or no successors + if (MBB.succ_size() != 1) + continue; + + // This BB is not a target if the successor is not a return block + // or if the successor includes instructions before the return + MachineBasicBlock *NextMBB = *(MBB.succ_begin()); + assert(NextMBB != NULL && + "By construction we must have one successor here"); + if (!NextMBB->isReturnBlock()) + continue; + + if (!NextMBB->getFirstNonDebugInstr()->getDesc().isReturn()) + continue; + + // We now investigate instructions in this BB + MachineInstr *CI = NULL; + for (MachineInstr &MI : MBB) { + if (MI.getDesc().isCall() && CI == NULL) { + // we can optimize only internal linkage + // since external linkers cannot handle direct return + if (!(MI.getOperand(0).isGlobal() && + MI.getOperand(0).getGlobal()->hasInternalLinkage())) + break; + + // we need to do platform-specific checks + if (!TII.isEligibleForDirectReturn(MI)) + break; + + CI = &MI; + continue; + } + + if (MI.getOpcode() == FrameSetupOpcode || + MI.getOpcode() == FrameDestroyOpcode) + continue; + + // If this BB accesses a CSR, or there is another instructions after call + // we cannot optimize this BB. + if ((CI != NULL && !MI.getDesc().isUnconditionalBranch()) || + useOrDefCSROrFI(MI, RS)) { + CI = NULL; + break; + } + } + if (CI != NULL) { + DEBUG(dbgs() << "Call instruction in this BB is selected " + "as a target of birect return\n"); + DirectReturnCandidates.push_back(CI); + } + } +} + bool ShrinkWrap::runOnMachineFunction(MachineFunction &MF) { if (MF.empty() || !isShrinkWrapEnabled(MF)) return false; @@ -441,6 +529,13 @@ std::unique_ptr RS( TRI->requiresRegisterScavenging(MF) ? new RegScavenger() : nullptr); + // We look for return targets before finding save/restore points. + // The direct return is supported, if DirectReturnCallOpcode != -1. + const TargetInstrInfo &TII = *MF.getSubtarget().getInstrInfo(); + unsigned DirectReturnCallOpcode = TII.getCallOpcodeForDirectReturn(); + if (EnableDirectReturnOpt && DirectReturnCallOpcode != (unsigned)-1) + findDirectReturnCandidates(TII, RS.get()); + for (MachineBasicBlock &MBB : MF) { DEBUG(dbgs() << "Look into: " << MBB.getNumber() << ' ' << MBB.getName() << '\n'); @@ -467,6 +562,22 @@ break; } } + + if (!ArePointsInteresting() && !DirectReturnCandidates.empty()) { + // Shrink wrapping need to select save and restore points, + // but there is no more frame/CSR code due to direct return support. + // We create an unreachable BB and select it for save and restore points. + // This unreachable BB will be eliminated by control flow optimizer. + MachineBasicBlock *ReturnBlock = *(DirectReturnCandidates[0]->getParent()->succ_begin()); + MachineBasicBlock *NewMBB = MF.CreateMachineBasicBlock(); + MF.push_back(NewMBB); + TII.insertUnconditionalBranch(*NewMBB, &*ReturnBlock, ReturnBlock->findBranchDebugLoc()); + NewMBB->addSuccessor(ReturnBlock); + updateSaveRestorePoints(*NewMBB, RS.get()); + DEBUG(dbgs() << "Created an unreachable BB#" << NewMBB->getNumber() + << " to select for creating stack frame\n"); + } + if (!ArePointsInteresting()) { // If the points are not interesting at this point, then they must be null // because it means we did not encounter any frame/CSR related code. @@ -510,6 +621,43 @@ updateSaveRestorePoints(*NewBB, RS.get()); } while (Save && Restore); + if (!DirectReturnCandidates.empty()) { + const MCInstrDesc &MCID = TII.get(DirectReturnCallOpcode); + for (MachineInstr *CandidateCI : DirectReturnCandidates) { + MachineBasicBlock *MBB = CandidateCI->getParent(); + // If the candidate is after the stack save, we don't modify it + if (MDT->dominates(Save, MBB)) { + assert(MPDT->dominates(Restore, MBB) && + "inconsistent dominance information"); + continue; + } + + DEBUG(dbgs() << "The call in " << MBB->getName() + << " is modified for direct return\n"); + // We replace the opcode of the call by the special one, + // then eliminate unnecessary instructions. + // The control never go back from the call. + CandidateCI->setDesc(MCID); + if (MBB->back().getDesc().isBranch()) + MBB->pop_back(); + MBB->removeSuccessor(MBB->succ_begin()); + assert(MBB->succ_empty() && + "The candidate MBB has only one succrssor and it must be removed."); + MachineInstr *FSetupMI = NULL, *FDestroyMI = NULL; + for (MachineInstr &MI : *MBB) { + if (MI.getOpcode() == FrameSetupOpcode) + FSetupMI = &MI; + if (MI.getOpcode() == FrameDestroyOpcode) + FDestroyMI = &MI; + } + assert((FSetupMI && FDestroyMI) && "We cannot find frame instructions"); + MBB->erase(FSetupMI); + MBB->erase(FDestroyMI); + + NumDirectReturn++; + } + } + if (!ArePointsInteresting()) { ++NumCandidatesDropped; return false; Index: lib/Target/PowerPC/PPCISelLowering.cpp =================================================================== --- lib/Target/PowerPC/PPCISelLowering.cpp +++ lib/Target/PowerPC/PPCISelLowering.cpp @@ -5235,6 +5235,14 @@ if (Flags.isInConsecutiveRegsLast()) NumBytes = ((NumBytes + PtrByteSize - 1)/PtrByteSize) * PtrByteSize; } + PPCFunctionInfo *FuncInfo = MF.getInfo(); + if (CallConv != CallingConv::Fast) { + FuncInfo->setHasParameterArea(HasParameterArea); + } else { + bool FitInRegs = (NumGPRsUsed <= NumGPRs && NumVRsUsed <= NumVRs && + NumFPRsUsed <= NumFPRs); + FuncInfo->setHasParameterArea(!FitInRegs); + } unsigned NumBytesActuallyUsed = NumBytes; Index: lib/Target/PowerPC/PPCInstr64Bit.td =================================================================== --- lib/Target/PowerPC/PPCInstr64Bit.td +++ lib/Target/PowerPC/PPCInstr64Bit.td @@ -140,6 +140,11 @@ (outs), (ins abscalltarget:$func), "bla $func\n\tnop", IIC_BrB, [(PPCcall_nop (i64 imm:$func))]>; + + // This opcode is for method call without setting link register + // used for direct return optimization in shrink wrapping + def B_CALL : IForm<18, 0, 0, (outs), (ins calltarget:$func), + "b $func", IIC_BrB, []>; } let Uses = [CTR8, RM] in { def BCTRL8 : XLForm_2_ext<19, 528, 20, 0, 1, (outs), (ins), Index: lib/Target/PowerPC/PPCInstrInfo.h =================================================================== --- lib/Target/PowerPC/PPCInstrInfo.h +++ lib/Target/PowerPC/PPCInstrInfo.h @@ -107,6 +107,9 @@ public: explicit PPCInstrInfo(PPCSubtarget &STI); + unsigned getCallOpcodeForDirectReturn() const override; + bool isEligibleForDirectReturn (const MachineInstr &CI) const override; + /// getRegisterInfo - TargetInstrInfo is a superset of MRegister info. As /// such, whenever a client has an instance of instruction info, it should /// always be able to get register info as well (through this method). Index: lib/Target/PowerPC/PPCInstrInfo.cpp =================================================================== --- lib/Target/PowerPC/PPCInstrInfo.cpp +++ lib/Target/PowerPC/PPCInstrInfo.cpp @@ -1931,3 +1931,33 @@ return &PPC::VSRCRegClass; return RC; } + +unsigned PPCInstrInfo::getCallOpcodeForDirectReturn() const { + if (Subtarget.isPPC64() && Subtarget.isELFv2ABI()) return PPC::B_CALL; + return -1; +} + +bool PPCInstrInfo::isEligibleForDirectReturn (const MachineInstr &CI) const { + assert(CI.getDesc().isCall() && + isa(CI.getOperand(0).getGlobal()) && + "isEligibleForDirectReturn requires call instrcution"); + assert(Subtarget.isPPC64() && Subtarget.isELFv2ABI() && + "We only support direct return on ELFv2 for PPC64"); + + const Function *CalleeFunc = dyn_cast(CI.getOperand(0).getGlobal()); + const MachineFunction *CallerMF = CI.getParent()->getParent(); + + // If this method returns another datatype, we cannot optimize. + if (!CallerMF->getFunction()->getReturnType()->isVoidTy() && + CallerMF->getFunction()->getReturnType() != CalleeFunc->getReturnType()) + return false; + + // A function call that requires parameter area exists in this method, + // so we do not eliminate stack creation. + // FIXME: It is better to check the use of parameter area of this callee + if (CallerMF->getInfo()->hasParameterArea()) + return false; + assert(!CalleeFunc->isVarArg() && "Vararg function uses parameter area"); + + return true; +} Index: lib/Target/PowerPC/PPCMachineFunctionInfo.h =================================================================== --- lib/Target/PowerPC/PPCMachineFunctionInfo.h +++ lib/Target/PowerPC/PPCMachineFunctionInfo.h @@ -113,6 +113,9 @@ /// copies bool IsSplitCSR = false; + /// True if this method has parameter save area in stack frame + bool HasParameterArea = false; + public: explicit PPCFunctionInfo(MachineFunction &MF) : MF(MF) {} @@ -188,6 +191,12 @@ bool isSplitCSR() const { return IsSplitCSR; } void setIsSplitCSR(bool s) { IsSplitCSR = s; } + void setHasParameterArea(bool s) { + // The current method has parameter area if one of callees requires it. + HasParameterArea |= s; + } + bool hasParameterArea() const { return HasParameterArea; } + MCSymbol *getPICOffsetSymbol() const; MCSymbol *getGlobalEPSymbol() const; Index: test/CodeGen/PowerPC/shrinkwrap_direct_return.ll =================================================================== --- /dev/null +++ test/CodeGen/PowerPC/shrinkwrap_direct_return.ll @@ -0,0 +1,121 @@ +; RUN: llc -verify-machineinstrs -O1 -mcpu=ppc64 -mtriple=powerpc64le-unknown-linux-gnu < %s | FileCheck %s + +define i64 @func1(i64 %a) { +; Since both functions calls are internal, we don't need to create/destroy stack frame at all in func1. +; CHECK-LABEL: func1: +; CHECK-NOT: mflr +; CHECK-DAG: b internal_func1 +; CHECK-DAG: b internal_func2 +; CHECK-NOT: mtlr + +check1: + %and = and i64 %a, 1 + %tobool = icmp eq i64 %and, 0 + br i1 %tobool, label %check2, label %if.then + +check2: + %and1 = and i64 %a, 2 + %tobool2 = icmp eq i64 %and1, 0 + br i1 %tobool2, label %return, label %if.then2 + +return: + %retval = phi i64 [ %call, %if.then ], [ %call2, %if.then2 ], [ %a, %check2 ] + ret i64 %retval + +if.then: + %call = tail call fastcc i64 @internal_func1(i64 %a) + br label %return + +if.then2: + %call2 = tail call fastcc i64 @internal_func2(i64 %a) + br label %return +} + + + +define i64 @func2(i64 %a) { +; Only internal function can be optimized with direct return. We create/destroy stack frame around external_func1. +; CHECK-LABEL: func2: +; CHECK-DAG: mflr +; CHECK-DAG: bl external_func1 +; CHECK-DAG: b internal_func1 +; CHECK-DAG: mtlr + +check1: + %and = and i64 %a, 1 + %tobool = icmp eq i64 %and, 0 + br i1 %tobool, label %check2, label %if.then + +check2: + %and1 = and i64 %a, 2 + %tobool2 = icmp eq i64 %and1, 0 + br i1 %tobool2, label %return, label %if.then2 + +return: + %retval = phi i64 [ %call, %if.then ], [ %call2, %if.then2 ], [ %a, %check2 ] + ret i64 %retval + +if.then: + %call = tail call fastcc i64 @internal_func1(i64 %a) + br label %return + +if.then2: + %call2 = tail call i64 @external_func1(i64 %a) + br label %return +} + + + +define i64 @func3(i64 %a) { +; Since a method call for vararg exists in func3, we do not optimize with direct return. +; Hence, we create stack frame in the method prologue and destroy it in epilogue. +; CHECK-LABEL: func3: +; CHECK: mflr +; CHECK-DAG: bl internal_func1 +; CHECK-DAG: bl internal_vararg_func +; CHECK: mtlr + +check1: + %and = and i64 %a, 1 + %tobool = icmp eq i64 %and, 0 + br i1 %tobool, label %check2, label %if.then + +check2: + %and1 = and i64 %a, 2 + %tobool2 = icmp eq i64 %and1, 0 + br i1 %tobool2, label %return, label %if.then2 + +return: + %retval = phi i64 [ %call, %if.then ], [ %call2, %if.then2 ], [ %a, %check2 ] + ret i64 %retval + +if.then: + %call = tail call fastcc i64 @internal_func1(i64 %a) + br label %return + +if.then2: + %call2 = tail call i64 (i64, ...) @internal_vararg_func(i64 %a) + br label %return +} + + + + +define internal fastcc i64 @internal_func1(i64 %a) unnamed_addr #1 { +check1: + ret i64 %a +} + +define internal fastcc i64 @internal_func2(i64 %a) unnamed_addr #1 { +check1: + ret i64 %a +} + +define internal i64 @internal_vararg_func(i64 %a, ...) unnamed_addr #1 { +check1: + ret i64 %a +} + +declare i64 @external_func1(i64) #1 + +attributes #1 = { noinline }