diff --git a/llvm/include/llvm/Transforms/IPO.h b/llvm/include/llvm/Transforms/IPO.h
--- a/llvm/include/llvm/Transforms/IPO.h
+++ b/llvm/include/llvm/Transforms/IPO.h
@@ -192,11 +192,11 @@
 ModulePass *createBlockExtractorPass();
 ModulePass *
 createBlockExtractorPass(const SmallVectorImpl<BasicBlock *> &BlocksToExtract,
-                         bool EraseFunctions);
+                         bool EraseFunctions, bool KeepOldBlocks = false);
 ModulePass *
 createBlockExtractorPass(const SmallVectorImpl<SmallVector<BasicBlock *, 16>>
                              &GroupsOfBlocksToExtract,
-                         bool EraseFunctions);
+                         bool EraseFunctions, bool KeepOldBlocks = false);
 
 /// createStripDeadPrototypesPass - This pass removes any function declarations
 /// (prototypes) that are not used.
diff --git a/llvm/include/llvm/Transforms/Utils/Cloning.h b/llvm/include/llvm/Transforms/Utils/Cloning.h
--- a/llvm/include/llvm/Transforms/Utils/Cloning.h
+++ b/llvm/include/llvm/Transforms/Utils/Cloning.h
@@ -112,10 +112,16 @@
 /// If you would like to collect additional information about the cloned
 /// function, you can specify a ClonedCodeInfo object with the optional fifth
 /// parameter.
-BasicBlock *CloneBasicBlock(const BasicBlock *BB, ValueToValueMapTy &VMap,
-                            const Twine &NameSuffix = "", Function *F = nullptr,
-                            ClonedCodeInfo *CodeInfo = nullptr,
-                            DebugInfoFinder *DIFinder = nullptr);
+///
+/// If you would like to clone only a subset of instructions in the basic block,
+/// you can specify a callback that returns true only for those instructions
+/// that are to be cloned with the optional seventh parameter.
+BasicBlock *
+CloneBasicBlock(const BasicBlock *BB, ValueToValueMapTy &VMap,
+                const Twine &NameSuffix = "", Function *F = nullptr,
+                ClonedCodeInfo *CodeInfo = nullptr,
+                DebugInfoFinder *DIFinder = nullptr,
+                function_ref<bool(const Instruction *)> InstSelect = {});
 
 /// Return a copy of the specified function and add it to that
 /// function's module.  Also, any references specified in the VMap are changed
diff --git a/llvm/include/llvm/Transforms/Utils/CodeExtractor.h b/llvm/include/llvm/Transforms/Utils/CodeExtractor.h
--- a/llvm/include/llvm/Transforms/Utils/CodeExtractor.h
+++ b/llvm/include/llvm/Transforms/Utils/CodeExtractor.h
@@ -101,6 +101,10 @@
     // If true, varargs functions can be extracted.
     bool AllowVarArgs;
 
+    /// If true, copies the code into the extracted function instead of moving
+    /// it.
+    bool KeepOldBlocks;
+
     // Bits of intermediate state computed at various phases of extraction.
     SetVector<BasicBlock *> Blocks;
     unsigned NumExitBlocks = std::numeric_limits<unsigned>::max();
@@ -132,13 +136,18 @@
     /// Any new allocations will be placed in the AllocationBlock, unless
     /// it is null, in which case it will be placed in the entry block of
     /// the function from which the code is being extracted.
+    ///
+    /// If KeepOldBlocks is true, the original blocks of the extracted region
+    /// remain in the original function so they can still be branched to from
+    /// non-extracted blocks. However, only branches to the first block will
+    /// call the extracted function.
     CodeExtractor(ArrayRef<BasicBlock *> BBs, DominatorTree *DT = nullptr,
                   bool AggregateArgs = false, BlockFrequencyInfo *BFI = nullptr,
                   BranchProbabilityInfo *BPI = nullptr,
                   AssumptionCache *AC = nullptr, bool AllowVarArgs = false,
                   bool AllowAlloca = false,
                   BasicBlock *AllocationBlock = nullptr,
-                  std::string Suffix = "");
+                  std::string Suffix = "", bool KeepOldBlocks = false);
 
     /// Create a code extractor for a loop body.
     ///
diff --git a/llvm/lib/Transforms/IPO/BlockExtractor.cpp b/llvm/lib/Transforms/IPO/BlockExtractor.cpp
--- a/llvm/lib/Transforms/IPO/BlockExtractor.cpp
+++ b/llvm/lib/Transforms/IPO/BlockExtractor.cpp
@@ -43,7 +43,8 @@
 namespace {
 class BlockExtractor {
 public:
-  BlockExtractor(bool EraseFunctions) : EraseFunctions(EraseFunctions) {}
+  BlockExtractor(bool EraseFunctions, bool KeepOldBlocks = false)
+      : EraseFunctions(EraseFunctions), KeepOldBlocks(KeepOldBlocks) {}
   bool runOnModule(Module &M);
   void init(const SmallVectorImpl<SmallVector<BasicBlock *, 16>>
                 &GroupsOfBlocksToExtract) {
@@ -60,6 +61,7 @@
 private:
   SmallVector<SmallVector<BasicBlock *, 16>, 4> GroupsOfBlocks;
   bool EraseFunctions;
+  bool KeepOldBlocks;
   /// Map a function name to groups of blocks.
   SmallVector<std::pair<std::string, SmallVector<std::string, 4>>, 4>
       BlocksByName;
@@ -75,8 +77,8 @@
 public:
   static char ID;
   BlockExtractorLegacyPass(const SmallVectorImpl<BasicBlock *> &BlocksToExtract,
-                           bool EraseFunctions)
-      : ModulePass(ID), BE(EraseFunctions) {
+                           bool EraseFunctions, bool KeepOldBlocks)
+      : ModulePass(ID), BE(EraseFunctions, KeepOldBlocks) {
     // We want one group per element of the input list.
     SmallVector<SmallVector<BasicBlock *, 16>, 4> MassagedGroupsOfBlocks;
     for (BasicBlock *BB : BlocksToExtract) {
@@ -89,13 +91,14 @@
 
   BlockExtractorLegacyPass(const SmallVectorImpl<SmallVector<BasicBlock *, 16>>
                                &GroupsOfBlocksToExtract,
-                           bool EraseFunctions)
-      : ModulePass(ID), BE(EraseFunctions) {
+                           bool EraseFunctions, bool KeepOldBlocks)
+      : ModulePass(ID), BE(EraseFunctions, KeepOldBlocks) {
     BE.init(GroupsOfBlocksToExtract);
   }
 
   BlockExtractorLegacyPass()
-      : BlockExtractorLegacyPass(SmallVector<BasicBlock *, 0>(), false) {}
+      : BlockExtractorLegacyPass(SmallVector<BasicBlock *, 0>(), false, false) {
+  }
 };
 
 } // end anonymous namespace
@@ -108,14 +111,17 @@
   return new BlockExtractorLegacyPass();
 }
 ModulePass *llvm::createBlockExtractorPass(
-    const SmallVectorImpl<BasicBlock *> &BlocksToExtract, bool EraseFunctions) {
-  return new BlockExtractorLegacyPass(BlocksToExtract, EraseFunctions);
+    const SmallVectorImpl<BasicBlock *> &BlocksToExtract, bool EraseFunctions,
+    bool KeepOldBlocks) {
+  return new BlockExtractorLegacyPass(BlocksToExtract, EraseFunctions,
+                                      KeepOldBlocks);
 }
 ModulePass *llvm::createBlockExtractorPass(
     const SmallVectorImpl<SmallVector<BasicBlock *, 16>>
         &GroupsOfBlocksToExtract,
-    bool EraseFunctions) {
-  return new BlockExtractorLegacyPass(GroupsOfBlocksToExtract, EraseFunctions);
+    bool EraseFunctions, bool KeepOldBlocks) {
+  return new BlockExtractorLegacyPass(GroupsOfBlocksToExtract, EraseFunctions,
+                                      KeepOldBlocks);
 }
 
 /// Gets all of the blocks specified in the input file.
@@ -223,7 +229,17 @@
       Changed = true;
     }
     CodeExtractorAnalysisCache CEAC(*BBs[0]->getParent());
-    Function *F = CodeExtractor(BlocksToExtractVec).extractCodeRegion(CEAC);
+    Function *F = CodeExtractor(BlocksToExtractVec,
+                                /* DT */ nullptr,
+                                /* AggregateArgs */ false,
+                                /* BFI */ nullptr,
+                                /* BPI */ nullptr,
+                                /* AC */ nullptr,
+                                /* AllowVarArgs */ false,
+                                /* AllowAlloca */ false,
+                                /* AllocationBlock */ nullptr,
+                                /* Suffix */ "", KeepOldBlocks)
+                      .extractCodeRegion(CEAC);
     if (F)
       LLVM_DEBUG(dbgs() << "Extracted group '" << (*BBs.begin())->getName()
                         << "' in: " << F->getName() << '\n');
diff --git a/llvm/lib/Transforms/Utils/CloneFunction.cpp b/llvm/lib/Transforms/Utils/CloneFunction.cpp
--- a/llvm/lib/Transforms/Utils/CloneFunction.cpp
+++ b/llvm/lib/Transforms/Utils/CloneFunction.cpp
@@ -38,10 +38,11 @@
 #define DEBUG_TYPE "clone-function"
 
 /// See comments in Cloning.h.
-BasicBlock *llvm::CloneBasicBlock(const BasicBlock *BB, ValueToValueMapTy &VMap,
-                                  const Twine &NameSuffix, Function *F,
-                                  ClonedCodeInfo *CodeInfo,
-                                  DebugInfoFinder *DIFinder) {
+BasicBlock *
+llvm::CloneBasicBlock(const BasicBlock *BB, ValueToValueMapTy &VMap,
+                      const Twine &NameSuffix, Function *F,
+                      ClonedCodeInfo *CodeInfo, DebugInfoFinder *DIFinder,
+                      function_ref<bool(const Instruction *)> InstSelect) {
   BasicBlock *NewBB = BasicBlock::Create(BB->getContext(), "", F);
   if (BB->hasName())
     NewBB->setName(BB->getName() + NameSuffix);
@@ -51,6 +52,9 @@
 
   // Loop over all instructions, and copy them over.
   for (const Instruction &I : *BB) {
+    if (InstSelect && !InstSelect(&I))
+      continue;
+
     if (DIFinder && TheModule)
       DIFinder->processInstruction(*TheModule, I);
 
diff --git a/llvm/lib/Transforms/Utils/CodeExtractor.cpp b/llvm/lib/Transforms/Utils/CodeExtractor.cpp
--- a/llvm/lib/Transforms/Utils/CodeExtractor.cpp
+++ b/llvm/lib/Transforms/Utils/CodeExtractor.cpp
@@ -61,6 +61,8 @@
 #include "llvm/Support/ErrorHandling.h"
 #include "llvm/Support/raw_ostream.h"
 #include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include "llvm/Transforms/Utils/Cloning.h"
+#include "llvm/Transforms/Utils/SSAUpdater.h"
 #include <cassert>
 #include <cstdint>
 #include <iterator>
@@ -195,7 +197,8 @@
 /// Build a set of blocks to extract if the input blocks are viable.
 static SetVector<BasicBlock *>
 buildExtractionBlockSet(ArrayRef<BasicBlock *> BBs, DominatorTree *DT,
-                        bool AllowVarArgs, bool AllowAlloca) {
+                        bool AllowVarArgs, bool AllowAlloca,
+                        bool KeepOldBlocks) {
   assert(!BBs.empty() && "The set of blocks to extract must be non-empty");
   SetVector<BasicBlock *> Result;
 
@@ -227,16 +230,20 @@
     }
 
     // All blocks other than the first must not have predecessors outside of
-    // the subgraph which is being extracted.
-    for (auto *PBB : predecessors(BB))
-      if (!Result.count(PBB)) {
-        LLVM_DEBUG(dbgs() << "No blocks in this region may have entries from "
-                             "outside the region except for the first block!\n"
-                          << "Problematic source BB: " << BB->getName() << "\n"
-                          << "Problematic destination BB: " << PBB->getName()
-                          << "\n");
-        return {};
-      }
+    // the subgraph which is being extracted. KeepOldBlocks relaxes this
+    // requirement.
+    if (!KeepOldBlocks) {
+      for (auto *PBB : predecessors(BB))
+        if (!Result.count(PBB)) {
+          LLVM_DEBUG(dbgs()
+                     << "No blocks in this region may have entries from "
+                        "outside the region except for the first block!\n"
+                     << "Problematic source BB: " << BB->getName() << "\n"
+                     << "Problematic destination BB: " << PBB->getName()
+                     << "\n");
+          return {};
+        }
+    }
   }
 
   return Result;
@@ -246,11 +253,13 @@
                              bool AggregateArgs, BlockFrequencyInfo *BFI,
                              BranchProbabilityInfo *BPI, AssumptionCache *AC,
                              bool AllowVarArgs, bool AllowAlloca,
-                             BasicBlock *AllocationBlock, std::string Suffix)
+                             BasicBlock *AllocationBlock, std::string Suffix,
+                             bool KeepOldBlocks)
     : DT(DT), AggregateArgs(AggregateArgs || AggregateArgsOpt), BFI(BFI),
       BPI(BPI), AC(AC), AllocationBlock(AllocationBlock),
-      AllowVarArgs(AllowVarArgs),
-      Blocks(buildExtractionBlockSet(BBs, DT, AllowVarArgs, AllowAlloca)),
+      AllowVarArgs(AllowVarArgs), KeepOldBlocks(KeepOldBlocks),
+      Blocks(buildExtractionBlockSet(BBs, DT, AllowVarArgs, AllowAlloca,
+                                     KeepOldBlocks)),
       Suffix(Suffix) {}
 
 CodeExtractor::CodeExtractor(DominatorTree &DT, Loop &L, bool AggregateArgs,
@@ -259,9 +268,11 @@
                              std::string Suffix)
     : DT(&DT), AggregateArgs(AggregateArgs || AggregateArgsOpt), BFI(BFI),
       BPI(BPI), AC(AC), AllocationBlock(nullptr), AllowVarArgs(false),
+      KeepOldBlocks(false),
       Blocks(buildExtractionBlockSet(L.getBlocks(), &DT,
                                      /* AllowVarArgs */ false,
-                                     /* AllowAlloca */ false)),
+                                     /* AllowAlloca */ false,
+                                     /* KeepOldBlocks */ false)),
       Suffix(Suffix) {}
 
 /// definedInRegion - Return true if the specified value is defined in the
@@ -648,6 +659,10 @@
     // If a used value is defined outside the region, it's an input.  If an
     // instruction is used outside the region, it's an output.
     for (Instruction &II : *BB) {
+      // Ignore assumptions if they have not been removed yet.
+      if (isa<AssumeInst>(II))
+        continue;
+
       for (auto &OI : II.operands()) {
         Value *V = OI;
         if (!SinkCands.count(V) && definedInCaller(Blocks, V))
@@ -1335,14 +1350,16 @@
 
   normalizeCFGForExtraction(header);
 
-  // Remove @llvm.assume calls that will be moved to the new function from the
-  // old function's assumption cache.
-  for (BasicBlock *Block : Blocks) {
-    for (Instruction &I : llvm::make_early_inc_range(*Block)) {
-      if (auto *AI = dyn_cast<AssumeInst>(&I)) {
-        if (AC)
-          AC->unregisterAssumption(AI);
-        AI->eraseFromParent();
+  if (!KeepOldBlocks) {
+    // Remove @llvm.assume calls that will be moved to the new function from the
+    // old function's assumption cache.
+    for (BasicBlock *Block : Blocks) {
+      for (Instruction &I : llvm::make_early_inc_range(*Block)) {
+        if (auto *AI = dyn_cast<AssumeInst>(&I)) {
+          if (AC)
+            AC->unregisterAssumption(AI);
+          AI->eraseFromParent();
+        }
       }
     }
   }
@@ -1400,8 +1417,10 @@
   // Determine position for the replacement code. Do so before header is moved
   // to the new function.
   BasicBlock *ReplIP = header;
-  while (ReplIP && Blocks.count(ReplIP))
-    ReplIP = ReplIP->getNextNode();
+  if (!KeepOldBlocks) {
+    while (ReplIP && Blocks.count(ReplIP))
+      ReplIP = ReplIP->getNextNode();
+  }
 
   // Construct new function based on inputs/outputs & add allocas for all defs.
   std::string SuffixToUse =
@@ -1463,6 +1482,31 @@
   // individually.
   recomputeExitBlocks();
   severSplitPHINodesOfExits();
+
+  // If the option was given, ensure there are no PHI nodes at all in the exit
+  // nodes themselves.
+  if (KeepOldBlocks) {
+    for (BasicBlock *Block : Blocks) {
+      for (BasicBlock *Succ : make_early_inc_range(successors(Block))) {
+        if (Blocks.count(Succ))
+          continue;
+
+        if (!Succ->getSinglePredecessor())
+          Succ = SplitEdge(Block, Succ, DT);
+
+        // Ensure no PHI node in exit block (still possible with single
+        // predecessor, e.g. LCSSA)
+        while (auto *P = dyn_cast<PHINode>(&Succ->front())) {
+          assert(P->getNumIncomingValues() == 1);
+          P->replaceAllUsesWith(P->getIncomingValue(0));
+          P->eraseFromParent();
+        }
+      }
+    }
+
+    // Exit blocks may have been changed by SplitEdge.
+    recomputeExitBlocks();
+  }
 }
 
 void CodeExtractor::recomputeExitBlocks() {
@@ -1494,18 +1538,43 @@
   BasicBlock *newFuncRoot =
       BasicBlock::Create(Context, "newFuncRoot", newFunction);
 
+  // The map of values from the original function to the corresponding values in
+  // the extracted function; only used with KeepOldBlocks.
+  ValueToValueMapTy VMap;
+
+  // Additional instructions not in an extracted block whose operands need to be
+  // remapped.
+  SmallVector<Instruction *> AdditionalRemap;
+
+  // Copy or move (depending on KeepOldBlocks) an instruction to the new
+  // function.
+  auto MoveOrCopyInst = [this, newFuncRoot, &VMap,
+                         &AdditionalRemap](Instruction *I) -> Instruction * {
+    BasicBlock::iterator IP = newFuncRoot->getFirstInsertionPt();
+    if (!KeepOldBlocks) {
+      I->moveBefore(*newFuncRoot, IP);
+      return I;
+    }
+
+    Instruction *ClonedI = I->clone();
+    ClonedI->setName(I->getName());
+    newFuncRoot->getInstList().insert(IP, ClonedI);
+    AdditionalRemap.push_back(ClonedI);
+    VMap[I] = ClonedI;
+    return ClonedI;
+  };
+
   // Now sink all instructions which only have non-phi uses inside the region.
   // Group the allocas at the start of the block, so that any bitcast uses of
   // the allocas are well-defined.
   for (auto *II : SinkingCands) {
     if (!isa<AllocaInst>(II)) {
-      cast<Instruction>(II)->moveBefore(*newFuncRoot,
-                                        newFuncRoot->getFirstInsertionPt());
+      MoveOrCopyInst(cast<Instruction>(II));
     }
   }
   for (auto *II : SinkingCands) {
     if (auto *AI = dyn_cast<AllocaInst>(II)) {
-      AI->moveBefore(*newFuncRoot, newFuncRoot->getFirstInsertionPt());
+      MoveOrCopyInst(AI);
     }
   }
 
@@ -1534,16 +1603,58 @@
     NewValues.push_back(RewriteVal);
   }
 
-  moveCodeToFunction(newFunction);
+  if (KeepOldBlocks) {
+    // Copy blocks and instructions to newFunction.
+    for (BasicBlock *Block : Blocks) {
+      BasicBlock *CBB = CloneBasicBlock(
+          Block, VMap, {}, newFunction, /* CodeInfo */ nullptr,
+          /* DIFinder */ nullptr,
+          [](const Instruction *I) -> bool { return !isa<AssumeInst>(I); });
+
+      // Add basic block mapping.
+      VMap[Block] = CBB;
+
+      // It is only legal to clone a function if a block address within that
+      // function is never referenced outside of the function.  Given that, we
+      // want to map block addresses from the old function to block addresses in
+      // the clone. (This is different from the generic ValueMapper
+      // implementation, which generates an invalid blockaddress when
+      // cloning a function.)
+      if (Block->hasAddressTaken()) {
+        Constant *OldBBAddr = BlockAddress::get(oldFunction, Block);
+        VMap[OldBBAddr] = BlockAddress::get(newFunction, CBB);
+      }
+
+      // Non-header blocks may have branches from outside the region. These
+      // continue to branch to the original blocks, hence remove their PHI
+      // entries from the cloned block.
+      if (Block != header)
+        for (auto &&P : CBB->phis()) {
+          unsigned NumIncoming = P.getNumIncomingValues();
+          for (int Idx = NumIncoming - 1; Idx >= 0; --Idx) {
+            BasicBlock *IncomingBlock = P.getIncomingBlock(Idx);
+            if (Blocks.count(IncomingBlock))
+              continue;
+            P.removeIncomingValue(Idx, /*DeletePHIIfEmpty=*/false);
+          }
+        }
+    }
+
+    for (auto P : enumerate(inputs))
+      VMap[P.value()] = NewValues[P.index()];
 
-  for (unsigned i = 0, e = inputs.size(); i != e; ++i) {
-    Value *RewriteVal = NewValues[i];
+  } else {
+    moveCodeToFunction(newFunction);
 
-    std::vector<User *> Users(inputs[i]->user_begin(), inputs[i]->user_end());
-    for (User *use : Users)
-      if (Instruction *inst = dyn_cast<Instruction>(use))
-        if (Blocks.count(inst->getParent()))
-          inst->replaceUsesOfWith(inputs[i], RewriteVal);
+    for (unsigned i = 0, e = inputs.size(); i != e; ++i) {
+      Value *RewriteVal = NewValues[i];
+
+      std::vector<User *> Users(inputs[i]->user_begin(), inputs[i]->user_end());
+      for (User *use : Users)
+        if (Instruction *inst = dyn_cast<Instruction>(use))
+          if (Blocks.count(inst->getParent()))
+            inst->replaceUsesOfWith(inputs[i], RewriteVal);
+    }
   }
 
   // Since there may be multiple exits from the original region, make the new
@@ -1562,6 +1673,8 @@
     BasicBlock *NewTarget = BasicBlock::Create(
         Context, OldTarget->getName() + ".exitStub", newFunction);
     ExitBlockMap[OldTarget] = NewTarget;
+    if (KeepOldBlocks)
+      VMap[OldTarget] = NewTarget;
 
     Value *brVal = nullptr;
     assert(NumExitBlocks < 0xffff && "too many exit blocks for switch");
@@ -1590,22 +1703,54 @@
       BasicBlock *NewTarget = ExitBlockMap[OldTarget];
       assert(NewTarget && "Unknown target block!");
 
-      // rewrite the original branch instruction with this new target
-      TI->setSuccessor(i, NewTarget);
+      if (KeepOldBlocks) {
+        VMap[OldTarget] = NewTarget;
+      } else {
+        // rewrite the original branch instruction with this new target
+        TI->setSuccessor(i, NewTarget);
+      }
     }
   }
 
-  // Loop over all of the PHI nodes in the header and exit blocks, and change
-  // any references to the old incoming edge to be the new incoming edge.
-  for (BasicBlock::iterator I = header->begin(); isa<PHINode>(I); ++I) {
-    PHINode *PN = cast<PHINode>(I);
-    for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i)
-      if (!Blocks.count(PN->getIncomingBlock(i)))
-        PN->setIncomingBlock(i, newFuncRoot);
+  // Update value references to point to the new function.
+  if (KeepOldBlocks) {
+    for (BasicBlock *Pred : predecessors(header)) {
+      if (VMap.count(Pred))
+        continue;
+      VMap[Pred] = newFuncRoot;
+    }
+
+    for (Instruction *II : AdditionalRemap)
+      RemapInstruction(II, VMap, RF_NoModuleLevelChanges);
+
+    // Loop over all of the instructions in the new function, fixing up operand
+    // references as we go. This uses VMap to do all the hard work.
+    for (BasicBlock *Block : Blocks) {
+      WeakTrackingVH NewBlock = VMap.lookup(Block);
+      if (!NewBlock)
+        continue;
+
+      // Loop over all instructions, fixing each one as we find it...
+      for (Instruction &II : cast<BasicBlock>(*NewBlock))
+        RemapInstruction(&II, VMap, RF_NoModuleLevelChanges);
+    }
+  } else {
+    // Loop over all of the PHI nodes in the header and exit blocks, and change
+    // any references to the old incoming edge to be the new incoming edge.
+    for (BasicBlock::iterator I = header->begin(); isa<PHINode>(I); ++I) {
+      PHINode *PN = cast<PHINode>(I);
+      for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i)
+        if (!Blocks.count(PN->getIncomingBlock(i)))
+          PN->setIncomingBlock(i, newFuncRoot);
+    }
   }
 
+  BasicBlock *NewHeader =
+      KeepOldBlocks ? cast<BasicBlock>(VMap.lookup(header)) : header;
+  assert(NewHeader && "Header must have been cloned/moved to newFunction");
+
   // Connect newFunction entry block to new header.
-  BranchInst *BranchI = BranchInst::Create(header, newFuncRoot);
+  BranchInst *BranchI = BranchInst::Create(NewHeader, newFuncRoot);
   applyFirstDebugLoc(oldFunction, Blocks.getArrayRef(), BranchI);
 
   // Store the arguments right after the definition of output value.
@@ -1613,7 +1758,6 @@
   // result restore will be placed in the outlined function.
   ScalarAI = newFunction->arg_begin();
   unsigned AggIdx = 0;
-
   for (Value *Input : inputs) {
     if (StructValues.contains(Input))
       ++AggIdx;
@@ -1622,6 +1766,9 @@
   }
 
   for (Value *Output : outputs) {
+    if (KeepOldBlocks)
+      Output = VMap.lookup(Output);
+
     // Find proper insertion point.
     // In case Output is an invoke, we insert the store at the beginning in the
     // 'normal destination' BB. Otherwise we insert the store right after
@@ -1866,35 +2013,72 @@
           !Blocks.count(I->getParent()))
         I->replaceUsesOfWith(header, codeReplacer);
 
-  // When moving the code region it is sufficient to replace all uses to the
-  // extracted function values. Since the original definition's block
-  // dominated its use, it will also be dominated by codeReplacer's switch
-  // which joined multiple exit blocks.
-  for (BasicBlock *ExitBB : SwitchCases)
-    for (PHINode &PN : ExitBB->phis()) {
-      Value *IncomingCodeReplacerVal = nullptr;
-      for (unsigned i = 0, e = PN.getNumIncomingValues(); i != e; ++i) {
-        // Ignore incoming values from outside of the extracted region.
-        if (!Blocks.count(PN.getIncomingBlock(i)))
+  if (KeepOldBlocks) {
+    // Change references to output values after the call to use either the value
+    // written by the extracted function or the original value if we skipped the
+    // call. Use SSAUpdater to propagate the new PHI since the CFG has changed.
+
+    SSAUpdater SSA;
+    for (auto P : enumerate(outputs)) {
+      size_t OutIdx = P.index();
+      Instruction *OldVal = cast<Instruction>(P.value());
+      Value *NewVal = Reloads[OutIdx];
+
+      SSA.Initialize(OldVal->getType(),
+                     (OldVal->getName() + ".merge_with_extracted").str());
+      SSA.AddAvailableValue(codeReplacer, NewVal);
+
+      // Could help SSAUpdater by determining in advance which output values are
+      // available in which exit blocks (from DT).
+      SSA.AddAvailableValue(OldVal->getParent(), OldVal);
+
+      for (Use &U : make_early_inc_range(OldVal->uses())) {
+        auto *User = dyn_cast<Instruction>(U.getUser());
+        if (!User)
           continue;
+        BasicBlock *EffectiveUser = User->getParent();
+        if (auto *PHI = dyn_cast<PHINode>(User))
+          EffectiveUser = PHI->getIncomingBlock(U);
 
-        // Ensure that there is only one incoming value from codeReplacer.
-        if (!IncomingCodeReplacerVal) {
-          PN.setIncomingBlock(i, codeReplacer);
-          IncomingCodeReplacerVal = PN.getIncomingValue(i);
-        } else
-          assert(IncomingCodeReplacerVal == PN.getIncomingValue(i) &&
-                 "PHI has two incompatbile incoming values from codeRepl");
+        if (EffectiveUser == codeReplacer || Blocks.count(EffectiveUser))
+          continue;
+
+        SSA.RewriteUseAfterInsertions(U);
       }
     }
+  } else {
+    // When moving the code region it is sufficient to replace all uses to the
+    // extracted function values. Since the original definition's block
+    // dominated its use, it will also be dominated by codeReplacer's switch
+    // which joined multiple exit blocks.
+
+    for (BasicBlock *ExitBB : SwitchCases)
+      for (PHINode &PN : ExitBB->phis()) {
+        Value *IncomingCodeReplacerVal = nullptr;
+        for (unsigned i = 0, e = PN.getNumIncomingValues(); i != e; ++i) {
+          // Ignore incoming values from outside of the extracted region.
+          if (!Blocks.count(PN.getIncomingBlock(i)))
+            continue;
 
-  for (unsigned i = 0, e = outputs.size(); i != e; ++i) {
-    Value *load = Reloads[i];
-    std::vector<User *> Users(outputs[i]->user_begin(), outputs[i]->user_end());
-    for (unsigned u = 0, e = Users.size(); u != e; ++u) {
-      Instruction *inst = cast<Instruction>(Users[u]);
-      if (inst->getParent()->getParent() == oldFunction)
-        inst->replaceUsesOfWith(outputs[i], load);
+          // Ensure that there is only one incoming value from codeReplacer.
+          if (!IncomingCodeReplacerVal) {
+            PN.setIncomingBlock(i, codeReplacer);
+            IncomingCodeReplacerVal = PN.getIncomingValue(i);
+          } else
+            assert(IncomingCodeReplacerVal == PN.getIncomingValue(i) &&
+                   "PHI has two incompatbile incoming values from codeRepl");
+        }
+      }
+
+    for (unsigned i = 0, e = outputs.size(); i != e; ++i) {
+      Value *load = Reloads[i];
+      std::vector<User *> Users(outputs[i]->user_begin(),
+                                outputs[i]->user_end());
+      for (unsigned u = 0, e = Users.size(); u != e; ++u) {
+        Instruction *inst = cast<Instruction>(Users[u]);
+        if (inst->getParent()->getParent() == oldFunction)
+          inst->replaceUsesOfWith(outputs[i], load);
+      }
     }
   }
 
diff --git a/llvm/test/tools/llvm-extract/extract-block-cleanup.ll b/llvm/test/tools/llvm-extract/extract-block-cleanup.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/tools/llvm-extract/extract-block-cleanup.ll
@@ -0,0 +1,116 @@
+; RUN: llvm-extract -S -bb "foo:region_start;extractonly;cleanup;fallback;region_end" --replace-with-call %s | FileCheck %s
+
+
+; CHECK-LABEL: define void @foo(i32* %arg, i1 %c) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    br i1 %c, label %codeRepl, label %outsideonly
+; CHECK-EMPTY:
+; CHECK-NEXT:  outsideonly:
+; CHECK-NEXT:    store i32 0, i32* %arg, align 4
+; CHECK-NEXT:    br label %cleanup
+; CHECK-EMPTY:
+; CHECK-NEXT:  codeRepl:
+; CHECK-NEXT:    %targetBlock = call i1 @foo.region_start(i32* %arg)
+; CHECK-NEXT:    br i1 %targetBlock, label %cleanup.return_crit_edge, label %region_end.split
+; CHECK-EMPTY:
+; CHECK-NEXT:  region_start:
+; CHECK-NEXT:    br label %extractonly
+; CHECK-EMPTY:
+; CHECK-NEXT:  extractonly:
+; CHECK-NEXT:    store i32 1, i32* %arg, align 4
+; CHECK-NEXT:    br label %cleanup
+; CHECK-EMPTY:
+; CHECK-NEXT:  cleanup:
+; CHECK-NEXT:    %dest = phi i8 [ 0, %outsideonly ], [ 1, %extractonly ]
+; CHECK-NEXT:    switch i8 %dest, label %fallback [
+; CHECK-NEXT:      i8 0, label %cleanup.return_crit_edge
+; CHECK-NEXT:      i8 1, label %region_end
+; CHECK-NEXT:    ]
+; CHECK-EMPTY:
+; CHECK-NEXT:  cleanup.return_crit_edge:
+; CHECK-NEXT:    br label %return
+; CHECK-EMPTY:
+; CHECK-NEXT:  fallback:
+; CHECK-NEXT:    unreachable
+; CHECK-EMPTY:
+; CHECK-NEXT:  region_end:
+; CHECK-NEXT:    br label %region_end.split
+; CHECK-EMPTY:
+; CHECK-NEXT:  region_end.split:
+; CHECK-NEXT:    br label %return
+; CHECK-EMPTY:
+; CHECK-NEXT:  outsidecont:
+; CHECK-NEXT:    br label %return
+; CHECK-EMPTY:
+; CHECK-NEXT:  return:
+; CHECK-NEXT:    ret void
+; CHECK-NEXT:  }
+
+
+; CHECK-LABEL: define internal i1 @foo.region_start(i32* %arg) {
+; CHECK-NEXT:  newFuncRoot:
+; CHECK-NEXT:    br label %region_start
+; CHECK-EMPTY:
+; CHECK-NEXT:  region_start:
+; CHECK-NEXT:    br label %extractonly
+; CHECK-EMPTY:
+; CHECK-NEXT:  extractonly:
+; CHECK-NEXT:    store i32 1, i32* %arg, align 4
+; CHECK-NEXT:    br label %cleanup
+; CHECK-EMPTY:
+; CHECK-NEXT:  cleanup:
+; CHECK-NEXT:    %dest = phi i8 [ 1, %extractonly ]
+; CHECK-NEXT:    switch i8 %dest, label %fallback [
+; CHECK-NEXT:      i8 0, label %cleanup.return_crit_edge.exitStub
+; CHECK-NEXT:      i8 1, label %region_end
+; CHECK-NEXT:    ]
+; CHECK-EMPTY:
+; CHECK-NEXT:  fallback:
+; CHECK-NEXT:    unreachable
+; CHECK-EMPTY:
+; CHECK-NEXT:  region_end:
+; CHECK-NEXT:    br label %region_end.split.exitStub
+; CHECK-EMPTY:
+; CHECK-NEXT:  cleanup.return_crit_edge.exitStub:
+; CHECK-NEXT:    ret i1 true
+; CHECK-EMPTY:
+; CHECK-NEXT:  region_end.split.exitStub:
+; CHECK-NEXT:    ret i1 false
+; CHECK-NEXT:  }
+
+
+
+define void @foo(i32* %arg, i1 %c) {
+entry:
+  br i1 %c, label %region_start, label %outsideonly
+
+outsideonly:
+  store i32 0, i32* %arg, align 4
+  br label %cleanup
+
+region_start:
+  br label %extractonly
+
+extractonly:
+  store i32 1, i32* %arg, align 4
+  br label %cleanup
+
+cleanup:
+  %dest = phi i8 [0, %outsideonly], [1, %extractonly]
+  switch i8 %dest, label %fallback [
+    i8 0, label %return
+    i8 1, label %region_end
+  ]
+
+fallback:
+  unreachable
+
+region_end:
+  br label %return
+
+outsidecont:
+  br label %return
+
+return:
+  ret void
+}
diff --git a/llvm/test/tools/llvm-extract/extract-block-multiple-exits.ll b/llvm/test/tools/llvm-extract/extract-block-multiple-exits.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/tools/llvm-extract/extract-block-multiple-exits.ll
@@ -0,0 +1,200 @@
+; RUN: llvm-extract -S -bb "func:region_start;exiting0;exiting1" --replace-with-call %s | FileCheck %s
+
+
+; CHECK-LABEL: define void @func(i32* %arg, i1 %c0, i1 %c1, i1 %c2, i8 %dest) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    %B.ce.loc = alloca i32, align 4
+; CHECK-NEXT:    %c.loc = alloca i32, align 4
+; CHECK-NEXT:    %b.loc = alloca i32, align 4
+; CHECK-NEXT:    %a.loc = alloca i32, align 4
+; CHECK-NEXT:    br i1 %c0, label %codeRepl, label %exit
+; CHECK-EMPTY:
+; CHECK-NEXT:  codeRepl:                                        
+; CHECK-NEXT:    %lt.cast = bitcast i32* %a.loc to i8*
+; CHECK-NEXT:    call void @llvm.lifetime.start.p0i8(i64 -1, i8* %lt.cast)
+; CHECK-NEXT:    %lt.cast1 = bitcast i32* %b.loc to i8*
+; CHECK-NEXT:    call void @llvm.lifetime.start.p0i8(i64 -1, i8* %lt.cast1)
+; CHECK-NEXT:    %lt.cast2 = bitcast i32* %c.loc to i8*
+; CHECK-NEXT:    call void @llvm.lifetime.start.p0i8(i64 -1, i8* %lt.cast2)
+; CHECK-NEXT:    %lt.cast3 = bitcast i32* %B.ce.loc to i8*
+; CHECK-NEXT:    call void @llvm.lifetime.start.p0i8(i64 -1, i8* %lt.cast3)
+; CHECK-NEXT:    %targetBlock = call i16 @func.region_start(i1 %c1, i1 %c2, i8 %dest, i32* %a.loc, i32* %b.loc, i32* %c.loc, i32* %B.ce.loc)
+; CHECK-NEXT:    %a.reload = load i32, i32* %a.loc, align 4
+; CHECK-NEXT:    %b.reload = load i32, i32* %b.loc, align 4
+; CHECK-NEXT:    %c.reload = load i32, i32* %c.loc, align 4
+; CHECK-NEXT:    %B.ce.reload = load i32, i32* %B.ce.loc, align 4
+; CHECK-NEXT:    call void @llvm.lifetime.end.p0i8(i64 -1, i8* %lt.cast)
+; CHECK-NEXT:    call void @llvm.lifetime.end.p0i8(i64 -1, i8* %lt.cast1)
+; CHECK-NEXT:    call void @llvm.lifetime.end.p0i8(i64 -1, i8* %lt.cast2)
+; CHECK-NEXT:    call void @llvm.lifetime.end.p0i8(i64 -1, i8* %lt.cast3)
+; CHECK-NEXT:    switch i16 %targetBlock, label %exit0 [
+; CHECK-NEXT:      i16 0, label %exiting0.exit_crit_edge
+; CHECK-NEXT:      i16 1, label %fallback
+; CHECK-NEXT:      i16 2, label %exit1
+; CHECK-NEXT:      i16 3, label %exit2
+; CHECK-NEXT:    ]
+; CHECK-EMPTY:
+; CHECK-NEXT:  region_start:                                
+; CHECK-NEXT:    %a = add i32 42, 1
+; CHECK-NEXT:    br i1 %c1, label %exiting0, label %exiting1
+; CHECK-EMPTY:
+; CHECK-NEXT:  exiting0:                                     
+; CHECK-NEXT:    %b = add i32 42, 2
+; CHECK-NEXT:    br i1 %c2, label %exiting0.exit_crit_edge, label %exit0.split
+; CHECK-EMPTY:
+; CHECK-NEXT:  exiting0.exit_crit_edge:                         
+; CHECK-NEXT:    %b.merge_with_extracted7 = phi i32 [ %b.reload, %codeRepl ], [ %b, %exiting0 ]
+; CHECK-NEXT:    br label %exit
+; CHECK-EMPTY:
+; CHECK-NEXT:  exiting1:                                        
+; CHECK-NEXT:    %c = add i32 42, 3
+; CHECK-NEXT:    switch i8 %dest, label %fallback [
+; CHECK-NEXT:      i8 0, label %exit0.split
+; CHECK-NEXT:      i8 1, label %exit1
+; CHECK-NEXT:      i8 2, label %exit2
+; CHECK-NEXT:      i8 3, label %exit0.split
+; CHECK-NEXT:    ]
+; CHECK-EMPTY:
+; CHECK-NEXT:  fallback:                                        
+; CHECK-NEXT:    unreachable
+; CHECK-EMPTY:
+; CHECK-NEXT:  exit:                                          
+; CHECK-NEXT:    %A = phi i32 [ 42, %entry ], [ %b.merge_with_extracted7, %exiting0.exit_crit_edge ]
+; CHECK-NEXT:    store i32 %A, i32* %arg, align 4
+; CHECK-NEXT:    br label %return
+; CHECK-EMPTY:
+; CHECK-NEXT:  exit0.split:                                     
+; CHECK-NEXT:    %b.merge_with_extracted6 = phi i32 [ %b, %exiting0 ], [ undef, %exiting1 ], [ undef, %exiting1 ]
+; CHECK-NEXT:    %B.ce = phi i32 [ %b, %exiting0 ], [ %a, %exiting1 ], [ %a, %exiting1 ]
+; CHECK-NEXT:    br label %exit0
+; CHECK-EMPTY:
+; CHECK-NEXT:  exit0:                                         
+; CHECK-NEXT:    %B.ce.merge_with_extracted = phi i32 [ %B.ce.reload, %codeRepl ], [ %B.ce, %exit0.split ]
+; CHECK-NEXT:    %b.merge_with_extracted = phi i32 [ %b.reload, %codeRepl ], [ %b.merge_with_extracted6, %exit0.split ]
+; CHECK-NEXT:    %a.merge_with_extracted5 = phi i32 [ %a.reload, %codeRepl ], [ %a, %exit0.split ]
+; CHECK-NEXT:    store i32 %a.merge_with_extracted5, i32* %arg, align 4
+; CHECK-NEXT:    store i32 %B.ce.merge_with_extracted, i32* %arg, align 4
+; CHECK-NEXT:    br label %after
+; CHECK-EMPTY:
+; CHECK-NEXT:  exit1:                                         
+; CHECK-NEXT:    %c.merge_with_extracted8 = phi i32 [ %c.reload, %codeRepl ], [ %c, %exiting1 ]
+; CHECK-NEXT:    %a.merge_with_extracted4 = phi i32 [ %a.reload, %codeRepl ], [ %a, %exiting1 ]
+; CHECK-NEXT:    br label %after
+; CHECK-EMPTY:
+; CHECK-NEXT:  exit2:                                         
+; CHECK-NEXT:    %c.merge_with_extracted = phi i32 [ %c.reload, %codeRepl ], [ %c, %exiting1 ]
+; CHECK-NEXT:    store i32 %c.merge_with_extracted, i32* %arg, align 4
+; CHECK-NEXT:    store i32 %c.merge_with_extracted, i32* %arg, align 4
+; CHECK-NEXT:    br label %return
+; CHECK-EMPTY:
+; CHECK-NEXT:  after:                                         
+; CHECK-NEXT:    %a.merge_with_extracted = phi i32 [ %a.merge_with_extracted5, %exit0 ], [ %a.merge_with_extracted4, %exit1 ]
+; CHECK-NEXT:    %D = phi i32 [ %b.merge_with_extracted, %exit0 ], [ %c.merge_with_extracted8, %exit1 ]
+; CHECK-NEXT:    store i32 %a.merge_with_extracted, i32* %arg, align 4
+; CHECK-NEXT:    store i32 %D, i32* %arg, align 4
+; CHECK-NEXT:    br label %return
+; CHECK-EMPTY:
+; CHECK-NEXT:  return:                                         
+; CHECK-NEXT:    ret void
+; CHECK-NEXT:  }
+
+
+; CHECK-LABEL: define internal i16 @func.region_start(i1 %c1, i1 %c2, i8 %dest, i32* %a.out, i32* %b.out, i32* %c.out, i32* %B.ce.out) {
+; CHECK-NEXT:  newFuncRoot:                                    
+; CHECK-NEXT:    br label %region_start
+; CHECK-EMPTY:
+; CHECK-NEXT:  region_start:                                   
+; CHECK-NEXT:    %a = add i32 42, 1
+; CHECK-NEXT:    store i32 %a, i32* %a.out, align 4
+; CHECK-NEXT:    br i1 %c1, label %exiting0, label %exiting1
+; CHECK-EMPTY:
+; CHECK-NEXT:  exiting0:                                     
+; CHECK-NEXT:    %b = add i32 42, 2
+; CHECK-NEXT:    store i32 %b, i32* %b.out, align 4
+; CHECK-NEXT:    br i1 %c2, label %exiting0.exit_crit_edge.exitStub, label %exit0.split
+; CHECK-EMPTY:
+; CHECK-NEXT:  exiting1:                                     
+; CHECK-NEXT:    %c = add i32 42, 3
+; CHECK-NEXT:    store i32 %c, i32* %c.out, align 4
+; CHECK-NEXT:    switch i8 %dest, label %fallback.exitStub [
+; CHECK-NEXT:      i8 0, label %exit0.split
+; CHECK-NEXT:      i8 1, label %exit1.exitStub
+; CHECK-NEXT:      i8 2, label %exit2.exitStub
+; CHECK-NEXT:      i8 3, label %exit0.split
+; CHECK-NEXT:    ]
+; CHECK-EMPTY:
+; CHECK-NEXT:  exit0.split:                                    
+; CHECK-NEXT:    %B.ce = phi i32 [ %b, %exiting0 ], [ %a, %exiting1 ], [ %a, %exiting1 ]
+; CHECK-NEXT:    store i32 %B.ce, i32* %B.ce.out, align 4
+; CHECK-NEXT:    br label %exit0.exitStub
+; CHECK-EMPTY:
+; CHECK-NEXT:  exiting0.exit_crit_edge.exitStub:                
+; CHECK-NEXT:    ret i16 0
+; CHECK-EMPTY:
+; CHECK-NEXT:  fallback.exitStub:                             
+; CHECK-NEXT:    ret i16 1
+; CHECK-EMPTY:
+; CHECK-NEXT:  exit1.exitStub:                                
+; CHECK-NEXT:    ret i16 2
+; CHECK-EMPTY:
+; CHECK-NEXT:  exit2.exitStub:                               
+; CHECK-NEXT:    ret i16 3
+; CHECK-EMPTY:
+; CHECK-NEXT:  exit0.exitStub:                                
+; CHECK-NEXT:    ret i16 4
+; CHECK-NEXT:  }
+
+; Input function; the RUN line extracts region_start;exiting0;exiting1 with --replace-with-call.
+define void @func(i32* %arg, i1 %c0, i1 %c1, i1 %c2, i8 %dest) {
+entry:
+  br i1 %c0, label %region_start, label %exit
+
+region_start:
+  %a = add i32 42, 1
+  br i1 %c1, label %exiting0, label %exiting1
+
+exiting0:
+  %b = add i32 42, 2
+  br i1 %c2, label %exit, label %exit0 ; both successors are outside the extracted blocks
+
+exiting1:
+  %c = add i32 42, 3
+  switch i8 %dest, label %fallback [
+    i8 0, label %exit0
+    i8 1, label %exit1
+    i8 2, label %exit2
+    i8 3, label %exit0 ; second case targeting %exit0, hence the duplicated incoming value in %B
+  ]
+
+fallback:
+  unreachable
+
+exit:
+  %A = phi i32 [ 42, %entry ], [ %b, %exiting0 ] ; mixes a value from outside the extracted blocks with one defined inside
+  store i32 %A, i32* %arg
+  br label %return
+
+exit0:
+  %B = phi i32 [ %b, %exiting0 ], [ %a, %exiting1 ] , [ %a, %exiting1 ]
+  store i32 %a, i32* %arg ; direct (non-phi) use of a value defined in region_start
+  store i32 %B, i32* %arg
+  br label %after
+
+exit1:
+  br label %after
+
+exit2:
+  %C = phi i32 [ %c, %exiting1 ] ; single-incoming phi
+  store i32 %c, i32* %arg
+  store i32 %C, i32* %arg
+  br label %return
+
+after:
+  %D = phi i32 [ %b, %exit0 ], [ %c, %exit1 ] ; NOTE(review): %b is defined in exiting0, but exit0 is also reachable from exiting1 - confirm this input satisfies the IR dominance rules
+  store i32 %a, i32* %arg
+  store i32 %D, i32* %arg
+  br label %return
+
+return:
+  ret void
+}
diff --git a/llvm/test/tools/llvm-extract/extract-block-sink.ll b/llvm/test/tools/llvm-extract/extract-block-sink.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/tools/llvm-extract/extract-block-sink.ll
@@ -0,0 +1,67 @@
+; RUN: llvm-extract -S -bb "foo:region_start" --replace-with-call %s | FileCheck %s
+
+
+
+; CHECK-LABEL: define void @foo() {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    %a = alloca i32, align 4
+; CHECK-NEXT:    %b = alloca i32, align 4
+; CHECK-NEXT:    br label %codeRepl
+; CHECK-EMPTY:
+; CHECK-NEXT:  codeRepl:
+; CHECK-NEXT:    call void @foo.region_start(i32* %b)
+; CHECK-NEXT:    br label %return
+; CHECK-EMPTY:
+; CHECK-NEXT:  region_start:
+; CHECK-NEXT:    call void @llvm.lifetime.start.p0i32(i64 4, i32* nonnull %a)
+; CHECK-NEXT:    store i32 43, i32* %a, align 4
+; CHECK-NEXT:    call void @llvm.lifetime.end.p0i32(i64 4, i32* nonnull %a)
+; CHECK-NEXT:    store i32 44, i32* %b, align 4
+; CHECK-NEXT:    br label %return
+; CHECK-EMPTY:
+; CHECK-NEXT:  return:
+; CHECK-NEXT:    ret void
+; CHECK-NEXT:  }
+
+
+; CHECK-LABEL: define internal void @foo.region_start(i32* %b) {
+; CHECK-NEXT:  newFuncRoot:
+; CHECK-NEXT:    %a = alloca i32, align 4
+; CHECK-NEXT:    br label %region_start
+; CHECK-EMPTY:
+; CHECK-NEXT:  region_start:
+; CHECK-NEXT:    call void @llvm.lifetime.start.p0i32(i64 4, i32* nonnull %a)
+; CHECK-NEXT:    store i32 43, i32* %a, align 4
+; CHECK-NEXT:    call void @llvm.lifetime.end.p0i32(i64 4, i32* nonnull %a)
+; CHECK-NEXT:    store i32 44, i32* %b, align 4
+; CHECK-NEXT:    br label %return.exitStub
+; CHECK-EMPTY:
+; CHECK-NEXT:  return.exitStub:
+; CHECK-NEXT:    ret void
+; CHECK-NEXT:  }
+
+
+
+
+
+
+declare void @llvm.lifetime.start.p0i32(i64, i32* nocapture)
+declare void @llvm.lifetime.end.p0i32(i64, i32* nocapture)
+
+; Input function; the RUN line extracts region_start with --replace-with-call.
+define void @foo() {
+entry:
+  %a = alloca i32, align 4 ; used only in region_start, between lifetime markers; the expected output allocates it inside @foo.region_start
+  %b = alloca i32, align 4 ; used only in region_start, without lifetime markers; the expected output passes it as an argument
+  br label %region_start
+
+region_start:
+  call void @llvm.lifetime.start.p0i32(i64 4, i32* nonnull %a)
+  store i32 43, i32* %a
+  call void @llvm.lifetime.end.p0i32(i64 4, i32* nonnull %a)
+  store i32 44, i32* %b
+  br label %return
+
+return:
+  ret void
+}
diff --git a/llvm/test/tools/llvm-extract/extract-block.ll b/llvm/test/tools/llvm-extract/extract-block.ll
--- a/llvm/test/tools/llvm-extract/extract-block.ll
+++ b/llvm/test/tools/llvm-extract/extract-block.ll
@@ -1,4 +1,6 @@
-; RUN: llvm-extract -S -bb foo:bb4 %s | FileCheck %s
+; RUN: llvm-extract -S -bb foo:bb4                     %s | FileCheck %s --check-prefixes=CHECK,KILL
+; RUN: llvm-extract -S -bb foo:bb4 --replace-with-call %s | FileCheck %s --check-prefixes=CHECK,KEEP
+
 
 ; CHECK: declare void @bar()
 define void @bar() {
@@ -12,7 +14,11 @@
   ret void
 }
 
-; CHECK: @foo.bb4
+; KEEP-LABEL: define i32 @foo(i32 %arg) {
+; KEEP:       call void @foo.bb4
+
+; KILL-LABEL: define dso_local void @foo.bb4(
+; KEEP-LABEL: define internal void @foo.bb4(
 ; CHECK: call void @bar()
 ; CHECK: %tmp5
 define i32 @foo(i32 %arg) {
diff --git a/llvm/test/tools/llvm-extract/extract-blocks-with-groups.ll b/llvm/test/tools/llvm-extract/extract-blocks-with-groups.ll
--- a/llvm/test/tools/llvm-extract/extract-blocks-with-groups.ll
+++ b/llvm/test/tools/llvm-extract/extract-blocks-with-groups.ll
@@ -1,10 +1,19 @@
-; RUN: llvm-extract -bb 'foo:if;then;else' -bb 'bar:bb14;bb20' -S %s  | FileCheck %s
+; RUN: llvm-extract -bb 'foo:if;then;else' -bb 'bar:bb14;bb20'                     -S %s | FileCheck %s --check-prefixes=CHECK,KILL
+; RUN: llvm-extract -bb 'foo:if;then;else' -bb 'bar:bb14;bb20' --replace-with-call -S %s | FileCheck %s --check-prefixes=CHECK,KEEP
 ; Extract two groups of basic blocks in two different functions.
 
 
+; KEEP-LABEL: define i32 @foo(i32 %arg, i32 %arg1) {
+; KEEP:         call void @foo.if.split(
+
+; KEEP-LABEL: define i32 @bar(i32 %arg, i32 %arg1) {
+; KEEP:         %targetBlock = call i1 @bar.bb14(
+
+
 ; The first extracted function is the region composed by the
 ; blocks if, then, and else from foo.
-; CHECK: define dso_local void @foo.if.split(i32 %arg1, i32 %arg, i32* %tmp.0.ce.out) {
+; KILL-LABEL: define dso_local void @foo.if.split(i32 %arg1, i32 %arg, i32* %tmp.0.ce.out) {
+; KEEP-LABEL: define internal void @foo.if.split(i32 %arg1, i32 %arg, i32* %tmp.0.ce.out) {
 ; CHECK: newFuncRoot:
 ; CHECK:   br label %if.split
 ;
@@ -25,7 +34,7 @@
 ; CHECK:   %or.cond = and i1 %tmp5, %tmp8
 ; CHECK:   br i1 %or.cond, label %then, label %else
 ;
-; CHECK: end.split:                                        ; preds = %then, %else
+; CHECK: end.split:
 ; CHECK:   %tmp.0.ce = phi i32 [ %tmp13, %then ], [ %tmp25, %else ]
 ; CHECK:   store i32 %tmp.0.ce, i32* %tmp.0.ce.out
 ; CHECK:   br label %end.exitStub
@@ -36,7 +45,8 @@
 
 ; The second extracted function is the region composed by the blocks
 ; bb14 and bb20 from bar.
-; CHECK: define dso_local i1 @bar.bb14(i32 %arg1, i32 %arg, i32* %tmp25.out) {
+; KILL-LABEL: define dso_local i1 @bar.bb14(i32 %arg1, i32 %arg, i32* %tmp25.out) {
+; KEEP-LABEL: define internal i1 @bar.bb14(i32 %arg1, i32 %arg, i32* %tmp25.out) {
 ; CHECK: newFuncRoot:
 ; CHECK:   br label %bb14
 ;
@@ -50,12 +60,14 @@
 ; CHECK:   %tmp24 = sdiv i32 %arg1, 6
 ; CHECK:   %tmp25 = add nsw i32 %tmp24, %tmp22
 ; CHECK:   store i32 %tmp25, i32* %tmp25.out
-; CHECK:   br label %bb30.exitStub
+; KILL:    br label %bb30.exitStub
+; KEEP:    br label %bb20.split.exitStub
 ;
 ; CHECK: bb26.exitStub:                                    ; preds = %bb14
 ; CHECK:   ret i1 true
 ;
-; CHECK: bb30.exitStub:                                    ; preds = %bb20
+; KILL:  bb30.exitStub:                                    ; preds = %bb20
+; KEEP:  bb20.split.exitStub:
 ; CHECK:   ret i1 false
 ; CHECK: }
 
diff --git a/llvm/tools/llvm-extract/llvm-extract.cpp b/llvm/tools/llvm-extract/llvm-extract.cpp
--- a/llvm/tools/llvm-extract/llvm-extract.cpp
+++ b/llvm/tools/llvm-extract/llvm-extract.cpp
@@ -85,14 +85,26 @@
         "Specify <function, basic block1[;basic block2...]> pairs to extract.\n"
         "Each pair will create a function.\n"
         "If multiple basic blocks are specified in one pair,\n"
-        "the first block in the sequence should dominate the rest.\n"
+        "the first block in the sequence should dominate the rest (unless "
+        "using --replace-with-call).\n"
         "eg:\n"
         "  --bb=f:bb1;bb2 will extract one function with both bb1 and bb2;\n"
         "  --bb=f:bb1 --bb=f:bb2 will extract two functions, one with bb1, one "
         "with bb2."),
     cl::ZeroOrMore, cl::value_desc("function:bb1[;bb2...]"),
+    cl::cat(ExtractCat))
+    ;
+
+// ReplaceWithCall - Keep original functions; call the extracted function.
+static cl::opt<bool> ReplaceWithCall(
+    "replace-with-call",
+    cl::desc(
+        "When extracting blocks from functions, keep the original functions; "
+        "extracted code is replaced by function call to new function"),
     cl::cat(ExtractCat));
 
+
+
 // ExtractAlias - The alias to extract from the module.
 static cl::list<std::string>
     ExtractAliases("alias", cl::desc("Specify alias to extract"),
@@ -359,7 +371,8 @@
     }
 
     legacy::PassManager PM;
-    PM.add(createBlockExtractorPass(GroupOfBBs, true));
+    PM.add(createBlockExtractorPass(GroupOfBBs,  !ReplaceWithCall, ReplaceWithCall));
+    // TODO: Remove BBs from original function that have become dead.
     PM.run(*M);
   }