Index: lib/Target/PowerPC/CMakeLists.txt
===================================================================
--- lib/Target/PowerPC/CMakeLists.txt
+++ lib/Target/PowerPC/CMakeLists.txt
@@ -22,6 +22,7 @@
   PPCISelDAGToDAG.cpp
   PPCISelLowering.cpp
   PPCEarlyReturn.cpp
+  PPCEnableShrinkWrap.cpp
   PPCFastISel.cpp
   PPCFrameLowering.cpp
   PPCLoopDataPrefetch.cpp
Index: lib/Target/PowerPC/PPC.h
===================================================================
--- lib/Target/PowerPC/PPC.h
+++ lib/Target/PowerPC/PPC.h
@@ -27,6 +27,7 @@
   class FunctionPass;
   class ImmutablePass;
   class MachineInstr;
+  class ModulePass;
   class AsmPrinter;
   class MCInst;
 
@@ -46,27 +47,29 @@
   FunctionPass *createPPCISelDag(PPCTargetMachine &TM);
   FunctionPass *createPPCTLSDynamicCallPass();
   FunctionPass *createPPCBoolRetToIntPass();
+  ModulePass *createPPCEnableShrinkWrapPass();
   void LowerPPCMachineInstrToMCInst(const MachineInstr *MI, MCInst &OutMI,
                                     AsmPrinter &AP, bool isDarwin);
 
   void initializePPCVSXFMAMutatePass(PassRegistry&);
   void initializePPCBoolRetToIntPass(PassRegistry&);
+  void initializePPCEnableShrinkWrapPass(PassRegistry&);
   extern char &PPCVSXFMAMutateID;
 
   namespace PPCII {
-    
+
   /// Target Operand Flag enum.
   enum TOF {
     //===------------------------------------------------------------------===//
     // PPC Specific MachineOperand flags.
     MO_NO_FLAG,
-    
+
     /// MO_PLT_OR_STUB - On a symbol operand "FOO", this indicates that the
     /// reference is actually to the "FOO$stub" or "FOO@plt" symbol.  This is
     /// used for calls and jumps to external functions on Tiger and earlier, and
     /// for PIC calls on Linux and ELF systems.
     MO_PLT_OR_STUB = 1,
-    
+
     /// MO_PIC_FLAG - If this bit is set, the symbol reference is relative to
     /// the function's picbase, e.g. lo16(symbol-picbase).
     MO_PIC_FLAG = 2,
@@ -74,7 +77,7 @@
     /// MO_NLP_FLAG - If this bit is set, the symbol reference is actually to
     /// the non_lazy_ptr for the global, e.g. lo16(symbol$non_lazy_ptr-picbase).
     MO_NLP_FLAG = 4,
-    
+
     /// MO_NLP_HIDDEN_FLAG - If this bit is set, the symbol reference is to a
     /// symbol with hidden visibility.  This causes a different kind of
     /// non-lazy-pointer to be generated.
@@ -100,7 +103,7 @@
     MO_TLS       = 8 << 4
   };
   } // end namespace PPCII
-  
+
 } // end namespace llvm;
 
 #endif
Index: lib/Target/PowerPC/PPCEnableShrinkWrap.cpp
===================================================================
--- lib/Target/PowerPC/PPCEnableShrinkWrap.cpp
+++ lib/Target/PowerPC/PPCEnableShrinkWrap.cpp
@@ -0,0 +1,270 @@
+//===- PPCEnableShrinkWrap.cpp - split functions to create shrink wrapping
+// opportunities ==/
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Shrink wrapping functions can improve performance for functions when the
+// stack is unused. However, all functions where value live-ranges contain
+// function calls are ineligible for shrink wrapping. Why?
+//
+// 1. To avoid saving and restoring values across call-sites, LLVM allocates
+// values with live-ranges that cross call sites to callee-saved registers
+//
+// 2. If callee-saved registers are used, they must be saved (usually in the
+// function prolog)
+//
+// 3. ShrinkWrapping scans from the beginning of the function prolog to the
+// first use of the stack. If a callee-saved register is saved to the stack in
+// the function prolog, ShrinkWrapping will give up early.
+//
+// To increase the applicability of ShrinkWrapping, this pass identifies
+// instances where a live-range straddling a call-site prevents ShrinkWrapping,
+// and splits the original function into a stackless ShrinkWrappable stub that
+// potentially makes a tail call to the remainder of the function.
+//
+// This transformation harms debuggability and consequently is not suitable for
+// lower -O levels.
+//
+//===----------------------------------------------------------------------===//
+#include "PPC.h"
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/Intrinsics.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/IR/MDBuilder.h"
+#include "llvm/IR/Module.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/Utils/Cloning.h"
+#include "llvm/Pass.h"
+
+using namespace llvm;
+
+static cl::
+opt<bool> DisablePreIncPrep("disable-ppc-enable-shrink-wrap", cl::Hidden,
+                            cl::desc("Disable PPC enable shrink wrap pass"));
+
+class PPCEnableShrinkWrap : public ModulePass {
+
+  static BasicBlock *getUniqueExitBlock(Function &F) {
+    BasicBlock *ExitBlock = nullptr;
+    for (auto &BB : F) {
+      if (isa<ReturnInst>(BB.getTerminator())) {
+        if (ExitBlock)
+          return nullptr;
+        else
+          ExitBlock = &BB;
+      }
+    }
+    return ExitBlock;
+  }
+
+  static bool branchesTo(TerminatorInst *term, BasicBlock *succ) {
+    for (unsigned i = 0; i < term->getNumSuccessors(); ++i)
+      if (term->getSuccessor(i) == succ)
+        return true;
+    return false;
+  }
+
+  static bool canBeComputedWithoutStack(llvm::Value *V) {
+    if (isa<Constant>(V))
+      return true;
+    if (isa<Argument>(V))
+      return true;
+    if (isa<PHINode>(V))
+      return true;
+    if (isa<BasicBlock>(V))
+      return true;
+    if (isa<AllocaInst>(V))
+      return false;
+    if (IntrinsicInst *I = dyn_cast<IntrinsicInst>(V))
+      return
+        I->getIntrinsicID() == Intrinsic::lifetime_end ||
+        I->getIntrinsicID() == Intrinsic::lifetime_start;
+    if (isa<DbgInfoIntrinsic>(V))
+      return true;
+    if (isa<CallInst>(V))
+      return false;
+    // Some casts and extracts require a stack slot for proc <= P7
+    // Conservatively assume all casts and extracts use the stack, since there
+    // is no easy way to query this information from the Target
+    if (isa<CastInst>(V) || isa<ExtractValueInst>(V))
+      return false;
+
+    Instruction *I = dyn_cast<Instruction>(V);
+    if (!I)
+      return false;
+    if (I->mayHaveSideEffects())
+      return false;
+    if (I->mayReadFromMemory())
+      return false;
+
+    for (unsigned i = 0; i < I->getNumOperands(); ++i)
+      if (!canBeComputedWithoutStack(I->getOperand(i)))
+        return false;
+
+    return true;
+  }
+
+  static bool canBeComputedWithoutStack(BasicBlock *exitBlock,
+                                        BasicBlock &entry) {
+    for (auto &I : *exitBlock) {
+      if (PHINode *Phi = dyn_cast<PHINode>(&I)) {
+        if (!canBeComputedWithoutStack(Phi->getIncomingValueForBlock(&entry)))
+          return false;
+      } else if (!canBeComputedWithoutStack(&I)) {
+        return false;
+      }
+    }
+    return true;
+  }
+
+  static bool hasCallInst(Function &F) {
+    for (auto &BB : F)
+      for (auto &I : BB)
+        if (isa<CallInst>(I) && !isa<IntrinsicInst>(I))
+          return true;
+    return false;
+  }
+
+  static bool hasVariadicCall(Function &F) {
+    for (auto &BB : F)
+      for (auto &I : BB)
+        if (CallInst *CI = dyn_cast<CallInst>(&I))
+          if (CI->getFunctionType()->isVarArg())
+            return true;
+    return false;
+  }
+
+  static bool mayHaveSideEffects(BasicBlock &BB) {
+    for (auto &I : BB)
+      if (I.mayHaveSideEffects())
+        return true;
+    return false;
+  }
+
+public:
+  static char ID;
+  PPCEnableShrinkWrap() : ModulePass(ID) {
+    initializePPCEnableShrinkWrapPass(*PassRegistry::getPassRegistry());
+  }
+
+  void getAnalysisUsage(AnalysisUsage &AU) const {
+    ModulePass::getAnalysisUsage(AU);
+  }
+
+  bool runOnModule(Module &M) {
+    bool Changed = false;
+    if (DisablePreIncPrep) return Changed;
+    SmallVector<Function *, 16> Funcs;
+    for (auto &F : M)
+      Funcs.emplace_back(&F);
+    for (auto &F : Funcs)
+      if (!F->isDeclaration())
+        Changed |= runOnFunction(*F);
+    return Changed;
+  }
+
+  static bool runOnFunction(Function &F) {
+    // Presently, only implemented for non-variadic functions that don't call
+    // variadic functions and that contain at least one callsite.
+    if (F.isVarArg())
+      return false;
+    if (!hasCallInst(F))
+      return false;
+    if (hasVariadicCall(F))
+      return false;
+
+    // Only unique exit blocks are handled
+    BasicBlock *ExitBlock = getUniqueExitBlock(F);
+    if (!ExitBlock)
+      return false;
+
+    // Look for the pattern of entry blocks that immediately branch to exit
+    // where the entry block and the exit block can be computed without the
+    // stack.
+    BasicBlock &Entry = F.getEntryBlock();
+    BranchInst *EntryTerm = dyn_cast<BranchInst>(Entry.getTerminator());
+    if (!EntryTerm)
+      return false;
+    if (EntryTerm->isUnconditional())
+      return false;
+    if (isa<Constant>(EntryTerm->getCondition()))
+      return false;
+    if (!branchesTo(EntryTerm, ExitBlock))
+      return false;
+    if (!canBeComputedWithoutStack(EntryTerm))
+      return false;
+    if (!canBeComputedWithoutStack(ExitBlock, Entry))
+      return false;
+    if (mayHaveSideEffects(Entry))
+      return false;
+
+    // Clone the original function
+    ValueToValueMapTy VMap;
+    ClonedCodeInfo CodeInfo;
+    Function *CloneF = CloneFunction(&F, VMap, true, &CodeInfo);
+    CloneF->setName(F.getName() + ".cont");
+    CloneF->setLinkage(GlobalValue::InternalLinkage);
+    F.getParent()->getFunctionList().push_back(CloneF);
+
+    // Replace the non-exit block successor of the entry block that makes a tail
+    // call to the cloned version of the function.
+    LLVMContext &C = F.getContext();
+    BasicBlock *TailCallBB = BasicBlock::Create(C, "tailCallBB", &F);
+    for (unsigned i = 0; i < EntryTerm->getNumSuccessors(); ++i) {
+      BasicBlock *Succ = EntryTerm->getSuccessor(i);
+      if (Succ != ExitBlock) {
+        for (auto I = Succ->begin(); &*I != Succ->getFirstNonPHI(); ++I) {
+          PHINode *Phi = cast<PHINode>(I);
+          Phi->removeIncomingValue(&Entry);
+        }
+        EntryTerm->setSuccessor(i, TailCallBB);
+      }
+    }
+    // Create the tail call
+    SmallVector<Value *, 16> Args;
+    for (auto &Arg : F.args())
+      Args.emplace_back(&Arg);
+    CallInst *TailCall = CallInst::Create(CloneF, Args, "", TailCallBB);
+    TailCall->setTailCall(true);
+    TailCall->setCallingConv(CloneF->getCallingConv());
+    // Return the result of the tail call if necessary.
+    if (F.getReturnType()->isVoidTy())
+      ReturnInst::Create(C, TailCallBB);
+    else
+      ReturnInst::Create(C, TailCall, TailCallBB);
+
+    // In the clone function, we know the non-early exit path will be taken
+    BranchInst *CloneEntryTerm = cast<BranchInst>(VMap[EntryTerm]);
+    Value *Cond = CloneEntryTerm->getSuccessor(0) == VMap[ExitBlock]
+                      ? ConstantInt::getFalse(C)
+                      : ConstantInt::getTrue(C);
+    CloneEntryTerm->setCondition(Cond);
+    // Add a hint to favor the early exit path. This improves performance when
+    // the early exit path is common and does not harm performance much if the
+    // early exit path is rare and the non-early exit path is relatively long
+    // running.
+    ExitBlock->moveAfter(&Entry);
+    SmallVector<uint32_t, 2> Weights = {
+        EntryTerm->getSuccessor(0) == ExitBlock ? 100u : 1u,
+        EntryTerm->getSuccessor(0) == ExitBlock ? 1u : 100u};
+    EntryTerm->setMetadata(LLVMContext::MD_prof,
+                           MDBuilder(C).createBranchWeights(Weights));
+    return true;
+  }
+};
+
+char PPCEnableShrinkWrap::ID = 0;
+INITIALIZE_PASS(PPCEnableShrinkWrap, "create-wrapper-opps",
+                "Find opportunities for wrapping", false, false)
+
+ModulePass *llvm::createPPCEnableShrinkWrapPass() {
+  return new PPCEnableShrinkWrap();
+}
Index: lib/Target/PowerPC/PPCTargetMachine.cpp
===================================================================
--- lib/Target/PowerPC/PPCTargetMachine.cpp
+++ lib/Target/PowerPC/PPCTargetMachine.cpp
@@ -74,6 +74,7 @@
 
   PassRegistry &PR = *PassRegistry::getPassRegistry();
   initializePPCBoolRetToIntPass(PR);
+  initializePPCEnableShrinkWrapPass(PR);
 }
 
 /// Return the datalayout string of a subtarget.
@@ -304,6 +305,8 @@
 void PPCPassConfig::addIRPasses() {
   if (TM->getOptLevel() != CodeGenOpt::None)
     addPass(createPPCBoolRetToIntPass());
+  if (TM->getOptLevel() > CodeGenOpt::Default)
+    addPass(createPPCEnableShrinkWrapPass());
   addPass(createAtomicExpandPass(&getPPCTargetMachine()));
 
   // For the BG/Q (or if explicitly requested), add explicit data prefetch
Index: test/CodeGen/PowerPC/EnableShrinkWrapTest.ll
===================================================================
--- test/CodeGen/PowerPC/EnableShrinkWrapTest.ll
+++ test/CodeGen/PowerPC/EnableShrinkWrapTest.ll
@@ -0,0 +1,104 @@
+; RUN: opt -create-wrapper-opps -S -o - < %s | FileCheck %s
+
+target datalayout = "e-m:e-i64:64-n32:64"
+target triple = "powerpc64le-unknown-linux-gnu"
+
+; CHECK-LABEL: testEnableShrinkWrap
+define signext i32 @testEnableShrinkWrap(i32 signext %i) {
+entry:
+  %tobool = icmp eq i32 %i, 0
+  br i1 %tobool, label %if.end, label %if.then
+
+if.then:                                          ; preds = %entry
+; CHECK call i32 @testEnableStrinkWrap.cont
+  tail call void @doSomething()
+  br label %if.end
+
+if.end:                                           ; preds = %entry, %if.then
+  ret i32 %i
+}
+
+; CHECK-LABEL: doSomething
+declare void @doSomething()
+
+; CHECK-LABEL: testEnableShrinkWrapVariadic
+define signext i32 @testEnableShrinkWrapVariadic(i32 signext %i, ...) {
+entry:
+  %tobool = icmp eq i32 %i, 0
+  br i1 %tobool, label %if.end, label %if.then
+
+if.then:                                          ; preds = %entry
+; CHECK: call void @doSomething
+tail call void @doSomething()
+  br label %if.end
+
+if.end:                                           ; preds = %entry, %if.then
+  ret i32 %i
+}
+
+; CHECK-LABEL: callVariadic
+define signext i32 @callVariadic(i32 signext %i) {
+entry:
+  %tobool = icmp eq i32 %i, 0
+  br i1 %tobool, label %if.end, label %if.then
+
+if.then:                                          ; preds = %entry
+; CHECK: call void (i32, ...) @doSomethingVariadic
+  tail call void (i32, ...) @doSomethingVariadic(i32 signext %i, i32 signext 7)
+  br label %if.end
+
+if.end:                                           ; preds = %entry, %if.then
+  ret i32 %i
+}
+
+; CHECK-LABEL: declare void @doSomethingVariadic
+declare void @doSomethingVariadic(i32 signext, ...)
+
+; CHECK-LABEL: tooComplicatedControlFlow
+define signext i32 @tooComplicatedControlFlow(i32 zeroext %i, i32 zeroext %j) {
+entry:
+  br label %do.body
+
+do.body:                                          ; preds = %do.body, %entry
+  %i.addr.0 = phi i32 [ %i, %entry ], [ %dec, %do.body ]
+; CHECK: call void @doSomething
+  tail call void @doSomething()
+  %dec = add i32 %i.addr.0, -1
+  %tobool = icmp eq i32 %dec, 0
+  br i1 %tobool, label %do.body.1.preheader, label %do.body
+
+do.body.1.preheader:                              ; preds = %do.body
+  br label %do.body.1
+
+do.body.1:                                        ; preds = %do.body.1.preheader, %do.body.1
+  %j.addr.0 = phi i32 [ %dec3, %do.body.1 ], [ %j, %do.body.1.preheader ]
+; CHECK: call void @doSomething
+  tail call void @doSomething()
+  %dec3 = add i32 %j.addr.0, -1
+  %tobool4 = icmp eq i32 %dec3, 0
+  br i1 %tobool4, label %do.end.5, label %do.body.1
+
+do.end.5:                                         ; preds = %do.body.1
+  ret i32 7
+}
+
+; CHECK-LABEL: notSideEffectFree
+define signext i32 @notSideEffectFree(i32* nocapture readonly %i) {
+entry:
+  %incdec.ptr = getelementptr inbounds i32, i32* %i, i64 1
+  %0 = load i32, i32* %i, align 4
+  %tobool = icmp eq i32 %0, 0
+  br i1 %tobool, label %if.end, label %if.then
+
+if.then:                                          ; preds = %entry
+; CHECK: call void @doSomething
+  tail call void @doSomething()
+  br label %if.end
+
+if.end:                                           ; preds = %entry, %if.then
+  %1 = load i32, i32* %incdec.ptr, align 4
+  ret i32 %1
+}
+
+; CHECK-LABEL: testEnableShrinkWrap.cont
+; CHECK: call void @doSomething