Index: lib/Target/PowerPC/CMakeLists.txt
===================================================================
--- lib/Target/PowerPC/CMakeLists.txt
+++ lib/Target/PowerPC/CMakeLists.txt
@@ -22,6 +22,7 @@
   PPCISelDAGToDAG.cpp
   PPCISelLowering.cpp
   PPCEarlyReturn.cpp
+  PPCEnableShrinkWrap.cpp
   PPCFastISel.cpp
   PPCFrameLowering.cpp
   PPCLoopDataPrefetch.cpp
Index: lib/Target/PowerPC/PPC.h
===================================================================
--- lib/Target/PowerPC/PPC.h
+++ lib/Target/PowerPC/PPC.h
@@ -27,6 +27,7 @@
   class FunctionPass;
   class ImmutablePass;
   class MachineInstr;
+  class ModulePass;
   class AsmPrinter;
   class MCInst;
@@ -46,27 +47,29 @@
   FunctionPass *createPPCISelDag(PPCTargetMachine &TM);
   FunctionPass *createPPCTLSDynamicCallPass();
   FunctionPass *createPPCBoolRetToIntPass();
+  ModulePass *createPPCEnableShrinkWrapPass();
   void LowerPPCMachineInstrToMCInst(const MachineInstr *MI, MCInst &OutMI,
                                     AsmPrinter &AP, bool isDarwin);
   void initializePPCVSXFMAMutatePass(PassRegistry&);
   void initializePPCBoolRetToIntPass(PassRegistry&);
+  void initializePPCEnableShrinkWrapPass(PassRegistry&);
   extern char &PPCVSXFMAMutateID;

   namespace PPCII {
-    
+
   /// Target Operand Flag enum.
   enum TOF {
     //===------------------------------------------------------------------===//
     // PPC Specific MachineOperand flags.
     MO_NO_FLAG,
-    
+
     /// MO_PLT_OR_STUB - On a symbol operand "FOO", this indicates that the
     /// reference is actually to the "FOO$stub" or "FOO@plt" symbol.  This is
     /// used for calls and jumps to external functions on Tiger and earlier, and
     /// for PIC calls on Linux and ELF systems.
     MO_PLT_OR_STUB = 1,
-    
+
     /// MO_PIC_FLAG - If this bit is set, the symbol reference is relative to
     /// the function's picbase, e.g. lo16(symbol-picbase).
     MO_PIC_FLAG = 2,
@@ -74,7 +77,7 @@
     /// MO_NLP_FLAG - If this bit is set, the symbol reference is actually to
     /// the non_lazy_ptr for the global, e.g. lo16(symbol$non_lazy_ptr-picbase).
     MO_NLP_FLAG = 4,
-    
+
     /// MO_NLP_HIDDEN_FLAG - If this bit is set, the symbol reference is to a
     /// symbol with hidden visibility.  This causes a different kind of
     /// non-lazy-pointer to be generated.
@@ -100,7 +103,7 @@
     MO_TLS = 8 << 4
   };
   } // end namespace PPCII
-  
+
 } // end namespace llvm;

 #endif
Index: lib/Target/PowerPC/PPCEnableShrinkWrap.cpp
===================================================================
--- lib/Target/PowerPC/PPCEnableShrinkWrap.cpp
+++ lib/Target/PowerPC/PPCEnableShrinkWrap.cpp
@@ -0,0 +1,270 @@
+//===- PPCEnableShrinkWrap.cpp - split functions to create shrink wrapping
+// opportunities ==/
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Shrink wrapping functions can improve performance for functions when the
+// stack is unused. However, all functions where value live-ranges contain
+// function calls are ineligible for shrink wrapping. Why?
+//
+// 1. To avoid saving and restoring values across call-sites, LLVM allocates
+// values with live-ranges that cross call sites to callee-saved registers
+//
+// 2. If callee-saved registers are used, they must be saved (usually in the
+// function prolog)
+//
+// 3. ShrinkWrapping scans from the beginning of the function prolog to the
+// first use of the stack. If a callee-saved register is saved to the stack in
+// the function prolog, ShrinkWrapping will give up early.
+//
+// To increase the applicability of ShrinkWrapping, this pass identifies
+// instances where a live-range straddling a call-site prevents ShrinkWrapping,
+// and splits the original function into a stackless ShrinkWrappable stub that
+// potentially makes a tail call to the remainder of the function.
+//
+// This transformation harms debuggability and consequently is not suitable for
+// lower -O levels.
+// +//===----------------------------------------------------------------------===// +#include "PPC.h" +#include "llvm/IR/Function.h" +#include "llvm/IR/Instructions.h" +#include "llvm/IR/Intrinsics.h" +#include "llvm/IR/IntrinsicInst.h" +#include "llvm/IR/LLVMContext.h" +#include "llvm/IR/MDBuilder.h" +#include "llvm/IR/Module.h" +#include "llvm/Support/raw_ostream.h" +#include "llvm/Transforms/Scalar.h" +#include "llvm/Transforms/Utils/Cloning.h" +#include "llvm/Pass.h" + +using namespace llvm; + +static cl:: +opt DisablePreIncPrep("disable-ppc-enable-shrink-wrap", cl::Hidden, + cl::desc("Disable PPC enable shrink wrap pass")); + +class PPCEnableShrinkWrap : public ModulePass { + + static BasicBlock *getUniqueExitBlock(Function &F) { + BasicBlock *ExitBlock = nullptr; + for (auto &BB : F) { + if (isa(BB.getTerminator())) { + if (ExitBlock) + return nullptr; + else + ExitBlock = &BB; + } + } + return ExitBlock; + } + + static bool branchesTo(TerminatorInst *Term, BasicBlock *Succ) { + for (unsigned i = 0; i < Term->getNumSuccessors(); ++i) + if (Term->getSuccessor(i) == Succ) + return true; + return false; + } + + static bool canBeComputedWithoutStack(Value *V) { + if (isa(V)) + return true; + if (isa(V)) + return true; + if (isa(V)) + return true; + if (isa(V)) + return true; + if (isa(V)) + return false; + if (IntrinsicInst *I = dyn_cast(V)) + return + I->getIntrinsicID() == Intrinsic::lifetime_end || + I->getIntrinsicID() == Intrinsic::lifetime_start; + if (isa(V)) + return true; + if (isa(V)) + return false; + // Some casts and extracts require a stack slot for proc <= P7 + // Conservatively assume all casts and extracts use the stack, since there + // is no easy way to query this information from the Target + if (isa(V) || isa(V)) + return false; + + Instruction *I = dyn_cast(V); + if (!I) + return false; + if (I->mayHaveSideEffects()) + return false; + if (I->mayReadFromMemory()) + return false; + + for (unsigned i = 0; i < I->getNumOperands(); ++i) + if 
(!canBeComputedWithoutStack(I->getOperand(i))) + return false; + + return true; + } + + static bool canBeComputedWithoutStack(BasicBlock *exitBlock, + BasicBlock &entry) { + for (auto &I : *exitBlock) { + if (PHINode *Phi = dyn_cast(&I)) { + if (!canBeComputedWithoutStack(Phi->getIncomingValueForBlock(&entry))) + return false; + } else if (!canBeComputedWithoutStack(&I)) { + return false; + } + } + return true; + } + + static bool hasCallInst(Function &F) { + for (auto &BB : F) + for (auto &I : BB) + if (isa(I) && !isa(I)) + return true; + return false; + } + + static bool hasVariadicCall(Function &F) { + for (auto &BB : F) + for (auto &I : BB) + if (CallInst *CI = dyn_cast(&I)) + if (CI->getFunctionType()->isVarArg()) + return true; + return false; + } + + static bool mayHaveSideEffects(BasicBlock &BB) { + for (auto &I : BB) + if (I.mayHaveSideEffects()) + return true; + return false; + } + +public: + static char ID; + PPCEnableShrinkWrap() : ModulePass(ID) { + initializePPCEnableShrinkWrapPass(*PassRegistry::getPassRegistry()); + } + + void getAnalysisUsage(AnalysisUsage &AU) const { + ModulePass::getAnalysisUsage(AU); + } + + bool runOnModule(Module &M) { + bool Changed = false; + if (DisablePreIncPrep) return Changed; + SmallVector Funcs; + for (auto &F : M) + Funcs.emplace_back(&F); + for (auto &F : Funcs) + if (!F->isDeclaration()) + Changed |= runOnFunction(*F); + return Changed; + } + + static bool runOnFunction(Function &F) { + // Presently, only implemented for non-variadic functions that don't call + // variadic functions and that contain at least one callsite. 
+ if (F.isVarArg()) + return false; + if (!hasCallInst(F)) + return false; + if (hasVariadicCall(F)) + return false; + + // Only unique exit blocks are handled + BasicBlock *ExitBlock = getUniqueExitBlock(F); + if (!ExitBlock) + return false; + + // Look for the pattern of entry blocks that immediately branch to exit + // where the entry block and the exit block can be computed without the + // stack. + BasicBlock &Entry = F.getEntryBlock(); + BranchInst *EntryTerm = dyn_cast(Entry.getTerminator()); + if (!EntryTerm) + return false; + if (EntryTerm->isUnconditional()) + return false; + if (isa(EntryTerm->getCondition())) + return false; + if (!branchesTo(EntryTerm, ExitBlock)) + return false; + if (!canBeComputedWithoutStack(EntryTerm)) + return false; + if (!canBeComputedWithoutStack(ExitBlock, Entry)) + return false; + if (mayHaveSideEffects(Entry)) + return false; + + // Clone the original function + ValueToValueMapTy VMap; + ClonedCodeInfo CodeInfo; + Function *CloneF = CloneFunction(&F, VMap, true, &CodeInfo); + CloneF->setName(F.getName() + ".cont"); + CloneF->setLinkage(GlobalValue::InternalLinkage); + F.getParent()->getFunctionList().push_back(CloneF); + + // Replace the non-exit block successor of the entry block that makes a tail + // call to the cloned version of the function. 
+ LLVMContext &C = F.getContext(); + BasicBlock *TailCallBB = BasicBlock::Create(C, "tailCallBB", &F); + for (unsigned i = 0; i < EntryTerm->getNumSuccessors(); ++i) { + BasicBlock *Succ = EntryTerm->getSuccessor(i); + if (Succ != ExitBlock) { + for (auto I = Succ->begin(); &*I != Succ->getFirstNonPHI(); ++I) { + PHINode *Phi = cast(I); + Phi->removeIncomingValue(&Entry); + } + EntryTerm->setSuccessor(i, TailCallBB); + } + } + // Create the tail call + SmallVector Args; + for (auto &Arg : F.args()) + Args.emplace_back(&Arg); + CallInst *TailCall = CallInst::Create(CloneF, Args, "", TailCallBB); + TailCall->setTailCall(true); + TailCall->setCallingConv(CloneF->getCallingConv()); + // Return the result of the tail call if necessary. + if (F.getReturnType()->isVoidTy()) + ReturnInst::Create(C, TailCallBB); + else + ReturnInst::Create(C, TailCall, TailCallBB); + + // In the clone function, we know the non-early exit path will be taken + BranchInst *CloneEntryTerm = cast(VMap[EntryTerm]); + Value *Cond = CloneEntryTerm->getSuccessor(0) == VMap[ExitBlock] + ? ConstantInt::getFalse(C) + : ConstantInt::getTrue(C); + CloneEntryTerm->setCondition(Cond); + // Add a hint to favor the early exit path. This improves performance when + // the early exit path is common and does not harm performance much if the + // early exit path is rare and the non-early exit path is relatively long + // running. + ExitBlock->moveAfter(&Entry); + SmallVector Weights = { + EntryTerm->getSuccessor(0) == ExitBlock ? 100u : 1u, + EntryTerm->getSuccessor(0) == ExitBlock ? 
1u : 100u}; + EntryTerm->setMetadata(LLVMContext::MD_prof, + MDBuilder(C).createBranchWeights(Weights)); + return true; + } +}; + +char PPCEnableShrinkWrap::ID = 0; +INITIALIZE_PASS(PPCEnableShrinkWrap, "create-wrapper-opps", + "Find opportunities for wrapping", false, false) + +ModulePass *llvm::createPPCEnableShrinkWrapPass() { + return new PPCEnableShrinkWrap(); +} Index: lib/Target/PowerPC/PPCTargetMachine.cpp =================================================================== --- lib/Target/PowerPC/PPCTargetMachine.cpp +++ lib/Target/PowerPC/PPCTargetMachine.cpp @@ -74,6 +74,7 @@ PassRegistry &PR = *PassRegistry::getPassRegistry(); initializePPCBoolRetToIntPass(PR); + initializePPCEnableShrinkWrapPass(PR); } /// Return the datalayout string of a subtarget. @@ -304,6 +305,8 @@ void PPCPassConfig::addIRPasses() { if (TM->getOptLevel() != CodeGenOpt::None) addPass(createPPCBoolRetToIntPass()); + if (TM->getOptLevel() > CodeGenOpt::Default) + addPass(createPPCEnableShrinkWrapPass()); addPass(createAtomicExpandPass(&getPPCTargetMachine())); // For the BG/Q (or if explicitly requested), add explicit data prefetch Index: test/CodeGen/PowerPC/EnableShrinkWrapTest.ll =================================================================== --- test/CodeGen/PowerPC/EnableShrinkWrapTest.ll +++ test/CodeGen/PowerPC/EnableShrinkWrapTest.ll @@ -0,0 +1,104 @@ +; RUN: opt -create-wrapper-opps -S -o - < %s | FileCheck %s + +target datalayout = "e-m:e-i64:64-n32:64" +target triple = "powerpc64le-unknown-linux-gnu" + +; CHECK-LABEL: testEnableShrinkWrap +define signext i32 @testEnableShrinkWrap(i32 signext %i) { +entry: + %tobool = icmp eq i32 %i, 0 + br i1 %tobool, label %if.end, label %if.then + +if.then: ; preds = %entry +; CHECK call i32 @testEnableStrinkWrap.cont + tail call void @doSomething() + br label %if.end + +if.end: ; preds = %entry, %if.then + ret i32 %i +} + +; CHECK-LABEL: doSomething +declare void @doSomething() + +; CHECK-LABEL: testEnableShrinkWrapVariadic +define 
signext i32 @testEnableShrinkWrapVariadic(i32 signext %i, ...) { +entry: + %tobool = icmp eq i32 %i, 0 + br i1 %tobool, label %if.end, label %if.then + +if.then: ; preds = %entry +; CHECK: call void @doSomething +tail call void @doSomething() + br label %if.end + +if.end: ; preds = %entry, %if.then + ret i32 %i +} + +; CHECK-LABEL: callVariadic +define signext i32 @callVariadic(i32 signext %i) { +entry: + %tobool = icmp eq i32 %i, 0 + br i1 %tobool, label %if.end, label %if.then + +if.then: ; preds = %entry +; CHECK: call void (i32, ...) @doSomethingVariadic + tail call void (i32, ...) @doSomethingVariadic(i32 signext %i, i32 signext 7) + br label %if.end + +if.end: ; preds = %entry, %if.then + ret i32 %i +} + +; CHECK-LABEL: declare void @doSomethingVariadic +declare void @doSomethingVariadic(i32 signext, ...) + +; CHECK-LABEL: tooComplicatedControlFlow +define signext i32 @tooComplicatedControlFlow(i32 zeroext %i, i32 zeroext %j) { +entry: + br label %do.body + +do.body: ; preds = %do.body, %entry + %i.addr.0 = phi i32 [ %i, %entry ], [ %dec, %do.body ] +; CHECK: call void @doSomething + tail call void @doSomething() + %dec = add i32 %i.addr.0, -1 + %tobool = icmp eq i32 %dec, 0 + br i1 %tobool, label %do.body.1.preheader, label %do.body + +do.body.1.preheader: ; preds = %do.body + br label %do.body.1 + +do.body.1: ; preds = %do.body.1.preheader, %do.body.1 + %j.addr.0 = phi i32 [ %dec3, %do.body.1 ], [ %j, %do.body.1.preheader ] +; CHECK: call void @doSomething + tail call void @doSomething() + %dec3 = add i32 %j.addr.0, -1 + %tobool4 = icmp eq i32 %dec3, 0 + br i1 %tobool4, label %do.end.5, label %do.body.1 + +do.end.5: ; preds = %do.body.1 + ret i32 7 +} + +; CHECK-LABEL: notSideEffectFree +define signext i32 @notSideEffectFree(i32* nocapture readonly %i) { +entry: + %incdec.ptr = getelementptr inbounds i32, i32* %i, i64 1 + %0 = load i32, i32* %i, align 4 + %tobool = icmp eq i32 %0, 0 + br i1 %tobool, label %if.end, label %if.then + +if.then: ; preds = %entry 
+; CHECK: call void @doSomething + tail call void @doSomething() + br label %if.end + +if.end: ; preds = %entry, %if.then + %1 = load i32, i32* %incdec.ptr, align 4 + ret i32 %1 +} + +; CHECK-LABEL: testEnableShrinkWrap.cont +; CHECK: call void @doSomething