Index: llvm/include/llvm/Transforms/IPO/IROutliner.h
===================================================================
--- llvm/include/llvm/Transforms/IPO/IROutliner.h
+++ llvm/include/llvm/Transforms/IPO/IROutliner.h
@@ -95,6 +95,10 @@
   /// required for the following basic blocks in this case.
   bool EndsInBranch = false;
 
+  /// The PHIBlocks with their corresponding return block based on the return
+  /// value as the key.
+  DenseMap<Value *, BasicBlock *> PHIBlocks;
+
   /// Mapping of the argument number in the deduplicated function
   /// to a given constant, which is used when creating the arguments to the call
   /// to the newly created deduplicated function.  This is handled separately
@@ -182,7 +186,14 @@
   IROutliner(function_ref<TargetTransformInfo &(Function &)> GTTI,
              function_ref<IRSimilarityIdentifier &(Module &)> GIRSI,
              function_ref<OptimizationRemarkEmitter &(Function &)> GORE)
-      : getTTI(GTTI), getIRSI(GIRSI), getORE(GORE) {}
+      : getTTI(GTTI), getIRSI(GIRSI), getORE(GORE) {
+    
+    // Check that the DenseMap implementation has not changed.
+    assert(DenseMapInfo<unsigned>::getEmptyKey() == (unsigned)-1 &&
+           "DenseMapInfo<unsigned>'s empty key isn't -1!");
+    assert(DenseMapInfo<unsigned>::getTombstoneKey() == (unsigned)-2 &&
+           "DenseMapInfo<unsigned>'s tombstone key isn't -2!");
+  }
   bool run(Module &M);
 
 private:
Index: llvm/lib/Transforms/IPO/IROutliner.cpp
===================================================================
--- llvm/lib/Transforms/IPO/IROutliner.cpp
+++ llvm/lib/Transforms/IPO/IROutliner.cpp
@@ -104,6 +104,16 @@
   /// of the region.
   unsigned BranchesToOutside = 0;
 
+  /// Tracker counting backwards from the highest unsigned value possible to
+  /// avoid conflicting with the GVNs of assigned values.  We start at -3 since
+  /// -2 and -1 are assigned by the DenseMap.
+  unsigned PHINodeGVNTracker = -3;
+
+  DenseMap<unsigned,
+           std::pair<std::pair<unsigned, unsigned>, SmallVector<unsigned, 2>>>
+      PHINodeGVNToGVNs;
+  DenseMap<hash_code, unsigned> GVNsToPHINodeGVN;
+
   /// The number of instructions that will be outlined by extracting \ref
   /// Regions.
   InstructionCost Benefit = 0;
@@ -358,6 +368,22 @@
   return Benefit;
 }
 
+/// Check the \p OutputMappings structure for value \p Input, if it exists
+/// it has been used as an output for outlining, and has been renamed, and we
+/// return the new value, otherwise, we return the same value.
+///
+/// \param OutputMappings [in] - The mapping of values to their renamed value
+/// after being used as an output for an outlined region.
+/// \param Input [in] - The value to find the remapped value of, if it exists.
+/// \return The remapped value if it has been renamed, and the same value if has
+/// not.
+static Value *findOutputMapping(const DenseMap<Value *, Value *> OutputMappings,
+                                Value *Input) {
+  if (OutputMappings.find(Input) != OutputMappings.end())
+    return OutputMappings.find(Input)->second;
+  return Input;
+}
+
 /// Find whether \p Region matches the global value numbering to Constant
 /// mapping found so far.
 ///
@@ -838,6 +864,209 @@
   Region.NumExtractedInputs = OriginalIndex;
 }
 
+/// Check if the \p V has any uses outside of the region other than \p PN.
+///
+/// \param V [in] - The value to check.
+/// \param PHILoc [in] - The location in the PHINode of \p V.
+/// \param PN [in] - The PHINode using \p V.
+/// \param Exits [in] - The potential blocks we exit to from the outlined
+/// region.
+/// \param BlocksInRegion [in] - The basic blocks contained in the region.
+/// \returns true if \p V has any use soutside its region other than \p PN.
+static bool outputHasNonPHI(Value *V, unsigned PHILoc, PHINode &PN,
+                            SmallPtrSet<BasicBlock *, 1> &Exits,
+                            DenseSet<BasicBlock *> &BlocksInRegion) {
+  // We check to see if the value is used by the PHINode from some other
+  // predecessor not included in the region.  If it is, we make sure
+  // to keep it as an output.
+  SmallVector<unsigned, 2> IncomingNumbers(PN.getNumIncomingValues());
+  std::iota(IncomingNumbers.begin(), IncomingNumbers.end(), 0);
+  if (any_of(IncomingNumbers, [PHILoc, &PN, V, &BlocksInRegion](unsigned Idx) {
+        return (Idx != PHILoc && V == PN.getIncomingValue(Idx) &&
+                !BlocksInRegion.contains(PN.getIncomingBlock(Idx)));
+      }))
+    return true;
+
+  // Check if the value is used by any other instructions outside the region.
+  return any_of(V->users(), [&Exits, &BlocksInRegion](User *U) {
+    Instruction *I = dyn_cast<Instruction>(U);
+    if (!I)
+      return false;
+
+    // If the use of the item is inside the region, we skip it.  Uses
+    // inside the region give us useful information about how the item could be
+    // used as an output.
+    BasicBlock *Parent = I->getParent();
+    if (BlocksInRegion.contains(Parent))
+      return false;
+
+    // If it's not a PHINode then we definitely know the use matters.  This
+    // output value will not completely combined with another item in a PHINode
+    // as it is directly reference by another non-phi instruction
+    if (!isa<PHINode>(I))
+      return true;
+
+    // If we have a PHINode outside one of the exit locations, then it
+    // can be considered an outside use as well.  If there is a PHINode
+    // contained in the Exit where this values use matters, it will be
+    // caught when we analyze that PHINode.
+    if (!Exits.contains(Parent))
+      return true;
+
+    return false;
+  });
+}
+
+/// Test whether \p CurrentExitFromRegion contains any PhiNodes that should be
+/// considered outputs. A PHINodes is an output when more than one incoming
+/// value has been marked by the CodeExtractor as an output.
+///
+/// \param CurrentExitFromRegion [in] - The block to analyze.
+/// \param PotentialExitsFromRegion [in] - The potential exit blocks from the
+/// region.
+/// \param RegionBlocks [in] - The basic blocks in the region.
+/// \param Outputs [in, out] - The existing outputs for the region, we may add
+/// PHINodes to this as we find that they replace output values.
+/// \param OutputsReplacedByPHINode [out] - A set containing outputs that are
+/// totally replaced  by a PHINode.
+/// \param OutputsWithNonPhiUses [out] - A set containing outputs that are used
+/// in PHINodes, but have other uses, and should still be considered outputs.
+static void analyzeExitPHIsForOutputUses(
+    BasicBlock *CurrentExitFromRegion,
+    SmallPtrSet<BasicBlock *, 1> &PotentialExitsFromRegion,
+    DenseSet<BasicBlock *> &RegionBlocks, SetVector<Value *> &Outputs,
+    DenseSet<Value *> &OutputsReplacedByPHINode,
+    DenseSet<Value *> &OutputsWithNonPhiUses) {
+  for (PHINode &PN : CurrentExitFromRegion->phis()) {
+    // Find all incoming values from the outlining region.
+    SmallVector<unsigned, 2> IncomingVals;
+    for (unsigned I = 0, E = PN.getNumIncomingValues(); I < E; ++I)
+      if (RegionBlocks.contains(PN.getIncomingBlock(I)))
+        IncomingVals.push_back(I);
+
+    // Do not process PHI if there are no predecessors from region.
+    unsigned NumIncomingVals = IncomingVals.size();
+    if (NumIncomingVals == 0)
+      continue;
+
+    // If there is one predecessor, we mark it as a value that needs to be kept
+    // as an output.
+    if (NumIncomingVals == 1) {
+      Value *V = PN.getIncomingValue(*IncomingVals.begin());
+      OutputsWithNonPhiUses.insert(V);
+      OutputsReplacedByPHINode.erase(V);
+      continue;
+    }
+
+    // This PHINode will be used as an output value, so we add it to our list.
+    Outputs.insert(&PN);
+
+    // Not all of the incoming values should be ignored as other inputs and
+    // outputs may have uses in outlined region.  If they have other uses
+    // outside of the single PHINode we should not skip over it.
+    for (unsigned Idx : IncomingVals) {
+      Value *V = PN.getIncomingValue(Idx);
+      if (outputHasNonPHI(V, Idx, PN, PotentialExitsFromRegion, RegionBlocks)) {
+        OutputsWithNonPhiUses.insert(V);
+        OutputsReplacedByPHINode.erase(V);
+        continue;
+      }
+      if (!OutputsWithNonPhiUses.contains(V))
+        OutputsReplacedByPHINode.insert(V);
+    }
+  }
+}
+
+// Represents the type for the unsigned number denoting the output number for
+// phi node, along with the canonical number for the exit block.
+using ArgLocWithBBCanon = std::pair<unsigned, unsigned>;
+// The list of canonical numbers for the incoming values to a PHINode.
+using CanonList = SmallVector<unsigned, 2>;
+// The pair type representing the set of canonical values being combined in the
+// PHINode, along with the location data for the PHINode.
+using PHINodeData = std::pair<ArgLocWithBBCanon, CanonList>;
+
+/// Encode \p PND as an integer for easy lookup based on the argument location,
+/// the parent BasicBlock canonical numbering, and the canonical numbering of
+/// the values stored in the PHINode.
+///
+/// \param PND - The data to hash.
+/// \returns The hash code of \p PND.
+static hash_code encodePHINodeData(PHINodeData &PND) {
+  return llvm::hash_combine(
+      llvm::hash_value(PND.first.first), llvm::hash_value(PND.first.second),
+      llvm::hash_combine_range(PND.second.begin(), PND.second.end()));
+}
+
+/// Create a special GVN for PHINodes that will be used outside of
+/// the region.  We create a hash code based on the Canonical number of the
+/// parent BasicBlock, the canonical numbering of the values stored in the
+/// PHINode and the aggregate argument location.  This is used to find whether
+/// this PHINode type has been given a canonical numbering already.  If not, we
+/// assign it a value and store it for later use.  The value is returned to
+/// identify different output schemes for the set of regions.
+///
+/// \param Region - The region that \p PN is an output for.
+/// \param PN - The PHINode we are analyzing.
+/// \param AggArgIdx - The argument \p PN will be stored into.
+/// \returns An optional holding the assigned canonical number, or None if
+/// there is some attribute of the PHINode blocking it from being used.
+static Optional<unsigned> getGVNForPHINode(OutlinableRegion &Region,
+                                           PHINode *PN, unsigned AggArgIdx) {
+  OutlinableGroup &Group = *Region.Parent;
+  IRSimilarityCandidate &Cand = *Region.Candidate;
+  BasicBlock *PHIBB = PN->getParent();
+  CanonList PHIGVNs;
+  for (Value *Incoming : PN->incoming_values()) {
+    // If we cannot find a GVN, this means that the input to the PHINode is
+    // not included in the region we are trying to analyze, meaning, that if
+    // it was outlined, we would be adding an extra input.  We ignore this
+    // case for now, and so ignore the region.
+    Optional<unsigned> OGVN = Cand.getGVN(Incoming);
+    if (!OGVN.hasValue()) {
+      Region.IgnoreRegion = true;
+      return None;
+    }
+
+    // Collect the canonical numbers of the values in the PHINode.
+    unsigned GVN = OGVN.getValue();
+    OGVN = Cand.getCanonicalNum(GVN);
+    assert(OGVN.hasValue() && "No GVN found for incoming value?");
+    PHIGVNs.push_back(*OGVN);
+  }
+
+  // Now that we have the GVNs for the incoming values, we are going to combine
+  // them with the GVN of the incoming bock, and the output location of the
+  // PHINode to generate a hash value representing this instance of the PHINode.
+  DenseMap<hash_code, unsigned>::iterator GVNToPHIIt;
+  DenseMap<unsigned, PHINodeData>::iterator PHIToGVNIt;
+  Optional<unsigned> BBGVN = Cand.getGVN(PHIBB);
+  assert(BBGVN.hasValue() && "Could not find GVN for the incoming block!");
+
+  BBGVN = Cand.getCanonicalNum(BBGVN.getValue());
+  assert(BBGVN.hasValue() &&
+         "Could not find canonical number for the incoming block!");
+  // Create a pair of the exit block canonical value, and the aggregate
+  // argument location, connected to the canonical numbers stored in the
+  // PHINode.
+  PHINodeData TemporaryPair =
+      std::make_pair(std::make_pair(BBGVN.getValue(), AggArgIdx), PHIGVNs);
+  hash_code PHINodeDataHash = encodePHINodeData(TemporaryPair);
+
+  // Look for and create a new entry in our connection between canonical
+  // numbers for PHINodes, and the set of objects we just created.
+  GVNToPHIIt = Group.GVNsToPHINodeGVN.find(PHINodeDataHash);
+  if (GVNToPHIIt == Group.GVNsToPHINodeGVN.end()) {
+    bool Inserted = false;
+    std::tie(PHIToGVNIt, Inserted) = Group.PHINodeGVNToGVNs.insert(
+        std::make_pair(Group.PHINodeGVNTracker, TemporaryPair));
+    std::tie(GVNToPHIIt, Inserted) = Group.GVNsToPHINodeGVN.insert(
+        std::make_pair(PHINodeDataHash, Group.PHINodeGVNTracker--));
+  }
+
+  return GVNToPHIIt->second;
+}
+
 /// Create a mapping of the output arguments for the \p Region to the output
 /// arguments of the overall outlined function.
 ///
@@ -850,35 +1079,25 @@
   IRSimilarityCandidate &C = *Region.Candidate;
 
   SmallVector<BasicBlock *> BE;
-  DenseSet<BasicBlock *> BBSet;
-  C.getBasicBlocks(BBSet, BE);
+  DenseSet<BasicBlock *> BlocksInRegion;
+  C.getBasicBlocks(BlocksInRegion, BE);
 
   // Find the exits to the region.
   SmallPtrSet<BasicBlock *, 1> Exits;
   for (BasicBlock *Block : BE)
     for (BasicBlock *Succ : successors(Block))
-      if (!BBSet.contains(Succ))
+      if (!BlocksInRegion.contains(Succ))
         Exits.insert(Succ);
 
   // After determining which blocks exit to PHINodes, we add these PHINodes to
   // the set of outputs to be processed.  We also check the incoming values of
   // the PHINodes for whether they should no longer be considered outputs.
-  for (BasicBlock *ExitBB : Exits) {
-    for (PHINode &PN : ExitBB->phis()) {
-      // Find all incoming values from the outlining region.
-      SmallVector<unsigned, 2> IncomingVals;
-      for (unsigned Idx = 0; Idx < PN.getNumIncomingValues(); ++Idx)
-        if (BBSet.contains(PN.getIncomingBlock(Idx)))
-          IncomingVals.push_back(Idx);
-
-      // Do not process PHI if there is one (or fewer) predecessor from region.
-      if (IncomingVals.size() <= 1)
-        continue;
-
-      Region.IgnoreRegion = true;
-      return;
-    }
-  }
+  DenseSet<Value *> OutputsReplacedByPHINode;
+  DenseSet<Value *> OutputsWithNonPhiUses;
+  for (BasicBlock *ExitBB : Exits)
+    analyzeExitPHIsForOutputUses(ExitBB, Exits, BlocksInRegion, Outputs,
+                                 OutputsReplacedByPHINode,
+                                 OutputsWithNonPhiUses);
 
   // This counts the argument number in the extracted function.
   unsigned OriginalIndex = Region.NumExtractedInputs;
@@ -901,9 +1120,13 @@
     // do not have to be in same order, but are functionally the same, we will
     // have to use a different scheme, as one-to-one correspondence is not
     // guaranteed.
-    unsigned GlobalValue = C.getGVN(Output).getValue();
     unsigned ArgumentSize = Group.ArgumentTypes.size();
 
+    // If the output is combined in a PHINode, we make sure to skip over it.
+    if (OutputsReplacedByPHINode.contains(Output))
+      continue;
+
+    unsigned AggArgIdx = 0;
     for (unsigned Jdx = TypeIndex; Jdx < ArgumentSize; Jdx++) {
       if (Group.ArgumentTypes[Jdx] != PointerType::getUnqual(Output->getType()))
         continue;
@@ -915,7 +1138,7 @@
       AggArgsUsed.insert(Jdx);
       Region.ExtractedArgToAgg.insert(std::make_pair(OriginalIndex, Jdx));
       Region.AggArgToExtracted.insert(std::make_pair(Jdx, OriginalIndex));
-      Region.GVNStores.push_back(GlobalValue);
+      AggArgIdx = Jdx;
       break;
     }
 
@@ -924,18 +1147,49 @@
     // function to handle this output and create a mapping to it.
     if (!TypeFound) {
       Group.ArgumentTypes.push_back(PointerType::getUnqual(Output->getType()));
-      AggArgsUsed.insert(Group.ArgumentTypes.size() - 1);
+      unsigned ArgTypeIdx = Group.ArgumentTypes.size() - 1;
+      AggArgsUsed.insert(ArgTypeIdx);
       Region.ExtractedArgToAgg.insert(
-          std::make_pair(OriginalIndex, Group.ArgumentTypes.size() - 1));
+          std::make_pair(OriginalIndex, ArgTypeIdx));
       Region.AggArgToExtracted.insert(
-          std::make_pair(Group.ArgumentTypes.size() - 1, OriginalIndex));
-      Region.GVNStores.push_back(GlobalValue);
+          std::make_pair(ArgTypeIdx, OriginalIndex));
+      AggArgIdx = ArgTypeIdx;
     }
 
-    stable_sort(Region.GVNStores);
+    // TODO: Adapt to the extra input from the PHINode.
+    PHINode *PN = dyn_cast<PHINode>(Output);
+
+    Optional<unsigned> GVN;
+    if (PN && !BlocksInRegion.contains(PN->getParent())) {
+      // Values outside the region can be combined into PHINode when we
+      // have multiple exits. We collect both of these into a list to identify
+      // which values are being used in the PHINode. Each list identifies a
+      // different PHINode, and a different output. We store the PHINode as it's
+      // own canonical value.  These canonical values are also dependent on the
+      // output argument it is saved to.
+
+      // If two PHINodes have the same canonical values, but different aggregate
+      // argument locations, then they will have distinct Canonical Values.
+      GVN = getGVNForPHINode(Region, PN, AggArgIdx);
+      if (!GVN.hasValue())
+        return; 
+    } else {
+      GVN = C.getGVN(Output);
+      GVN = C.getCanonicalNum(*GVN);
+    }
+
+    // Each region has a potentially unique set of outputs.  We save which
+    // values are output in a list of canonical values so we can differentiate
+    // among the different store schemes.
+    Region.GVNStores.push_back(*GVN);
+
     OriginalIndex++;
     TypeIndex++;
   }
+
+  // We sort the stored values to make sure that we are not affected by analysis
+  // order when determining what combination of items were stored.
+  stable_sort(Region.GVNStores);
 }
 
 void IROutliner::findAddInputsOutputs(Module &M, OutlinableRegion &Region,
@@ -1071,6 +1325,202 @@
   return Call;
 }
 
+/// Find or create a BasicBlock in the outlined function containing PhiBlocks
+/// for \p RetVal.
+///
+/// \param Group - The OutlinableGroup containing the information about the
+/// overall outlined function.
+/// \param RetVal - The return value or exit option that we are currently
+/// evaluating.
+/// \returns The found or newly created BasicBlock to contain the needed
+/// PHINodes to be used as outputs.
+static BasicBlock *findOrCreatePHIBlock(OutlinableGroup &Group, Value *RetVal) {
+  DenseMap<Value *, BasicBlock *>::iterator PhiBlockForRetVal,
+      ReturnBlockForRetVal;
+  PhiBlockForRetVal = Group.PHIBlocks.find(RetVal);
+  ReturnBlockForRetVal = Group.EndBBs.find(RetVal);
+  assert(ReturnBlockForRetVal != Group.EndBBs.end() &&
+         "Could not find output value!");
+  BasicBlock *ReturnBB = ReturnBlockForRetVal->second;
+
+  // Find if a PHIBlock exists for this return value already.  If it is
+  // the first time we are anaylzing this, we will not, so we record it.
+  PhiBlockForRetVal = Group.PHIBlocks.find(RetVal);
+  if (PhiBlockForRetVal != Group.PHIBlocks.end())
+    return PhiBlockForRetVal->second;
+  
+  // If we did not find a block, we create one, and insert it into the
+  // overall function and record it.
+  BasicBlock *PHIBlock;
+  bool Inserted = false;
+  PHIBlock = BasicBlock::Create(ReturnBB->getContext(), "phi_block",
+                                ReturnBB->getParent());
+  std::tie(PhiBlockForRetVal, Inserted) =
+      Group.PHIBlocks.insert(std::make_pair(RetVal, PHIBlock));
+
+  // We find the predecessors in the overall function here.
+  SmallVector<BranchInst *, 2> BranchesToChange;
+  for (BasicBlock *Pred : predecessors(ReturnBB))
+    BranchesToChange.push_back(cast<BranchInst>(Pred->getTerminator()));
+
+  // Now we mark the branch instructions found, and change the references of the
+  // return block to the newly created PHIBlock.
+  for (BranchInst *BI : BranchesToChange)
+    for (unsigned Succ = 0, End = BI->getNumSuccessors(); Succ < End; Succ++) {
+      if (BI->getSuccessor(Succ) != ReturnBB)
+        continue;
+      BI->setSuccessor(Succ, PHIBlock);
+    }
+
+  BranchInst::Create(ReturnBB, PHIBlock);
+
+  return PhiBlockForRetVal->second;
+}
+
+/// For the function call now representing the \p Region, find the passed value
+/// to that call that represents Argument \p A at the call location.
+///
+/// \param A - The Argument to get the passed value for.
+/// \param Region - The extracted Region corresponding to the outlined function.
+/// \param ReplacedWithOutlinedCall - False when we must adjust the argument
+/// location when we are not analyzing the overall function call, but extracted
+/// function call. If true, we do not have to make this adjustment.
+/// \returns The Value representing \p A at the call location for some outlined
+/// function.
+static Value *getPassedArgument(Argument *A, OutlinableRegion &Region,
+                                bool ReplacedWithOutlinedCall) {
+  CallInst *CI = Region.Call;
+  unsigned ArgNum = A->getArgNo();
+
+  // If we don't need to adjust the argument number at all (since the call
+  // has already been replaced by a call to the overall outlined function)
+  // we can just get the specified argument.
+  if (ReplacedWithOutlinedCall)
+    return CI->getArgOperand(ArgNum);
+  
+  // If it is a constant, we can look at our mapping from when we created
+  // the outputs to figure out what the constant value is.
+  if (Region.AggArgToConstant.count(ArgNum))
+    return Region.AggArgToConstant.find(ArgNum)->second;
+  
+  // If it is not a constant, and we are not looking at the overall function, we
+  // need to adjust which argument we are looking at.
+  ArgNum = Region.AggArgToExtracted.find(ArgNum)->second;
+  return CI->getArgOperand(ArgNum);
+}
+
+/// Find the canonical numbering for the incoming Values into the PHINode \p PN.
+///
+/// \param PN [in] - The PHINode that we are finding the canonical numbers for.
+/// \param Region [in] - The OutlinableRegion containing \p PN.
+/// \param OutputMappings [in] - The mapping of output values from outlined
+/// region to their original values.
+/// \param CanonNums [out] - The canonical numbering for the incoming values to
+/// \p PN.
+/// \param ReplacedWithOutlinedCall - A flag to use the extracted function call
+/// of \p Region rather than the overall function's call.
+static void
+findCanonNumsForPHI(PHINode *PN, OutlinableRegion &Region,
+                    const DenseMap<Value *, Value *> &OutputMappings,
+                    DenseSet<unsigned> &CanonNums,
+                    bool ReplacedWithOutlinedCall = true) {
+  // Iterate over the incoming values.
+  for (unsigned Idx = 0, EIdx = PN->getNumIncomingValues(); Idx < EIdx; Idx++) {
+    Value *IVal = PN->getIncomingValue(Idx);
+    // If we have an argument as incoming value, we need to grab the passed
+    // value from the call itself.
+    if (Argument *A = dyn_cast<Argument>(IVal))
+      IVal = getPassedArgument(A, Region, ReplacedWithOutlinedCall);
+
+    // Get the original value if it has been replaced by an output value.
+    IVal = findOutputMapping(OutputMappings, IVal);
+
+    // Find and add the canonical number for the incoming value.
+    Optional<unsigned> GVN = Region.Candidate->getGVN(IVal);
+    assert(GVN.hasValue() && "No GVN for incoming value");
+    Optional<unsigned> CanonNum = Region.Candidate->getCanonicalNum(*GVN);
+    assert(CanonNum.hasValue() && "No Canonical Number for GVN");
+    CanonNums.insert(*CanonNum);
+  }
+}
+
+/// Find, or add PHINode \p PN to the combined PHINode Block \p OverallPHIBlock
+/// in order to condense the number of instructions added to the outlined
+/// function.
+///
+/// \param PN [in] - The PHINode that we are finding the canonical numbers for.
+/// \param Region [in] - The OutlinableRegion containing \p PN. 
+/// \param OverallPhiBlock [in] - The overall PHIBlock we are trying to find
+/// \p PN in.
+/// \param OutputMappings [in] - The mapping of output values from outlined
+/// region to their original values.
+/// \return the newly found or created PHINode in \p OverallPhiBlock.
+static PHINode*
+findOrCreatePHIInBlock(PHINode &PN, OutlinableRegion &Region,
+                       BasicBlock *OverallPhiBlock,
+                       const DenseMap<Value *, Value *> &OutputMappings) {
+  OutlinableGroup &Group = *Region.Parent;
+  
+  DenseSet<unsigned> PNCanonNums;
+  // We have to use the extracted function since we have merged this region into
+  // the overall function yet.  We make sure to reassign the argument numbering
+  // since it is possible that the argument ordering is different between the
+  // functions.
+  findCanonNumsForPHI(&PN, Region, OutputMappings, PNCanonNums,
+                      /* ReplacedWithOutlinedCall = */ false);
+
+  OutlinableRegion *FirstRegion = Group.Regions[0];
+  DenseSet<unsigned> CurrentCanonNums;
+  // Find the Canonical Numbering for each PHINode, if it matches, we replace
+  // the uses of the PHINode we are searching for, with the found PHINode.
+  for (PHINode &CurrPN : OverallPhiBlock->phis()) {
+    CurrentCanonNums.clear();
+    findCanonNumsForPHI(&CurrPN, *FirstRegion, OutputMappings, CurrentCanonNums,
+                        true);
+
+    if (all_of(PNCanonNums, [&CurrentCanonNums](unsigned CanonNum) {
+          return CurrentCanonNums.contains(CanonNum);
+        }))
+      return &CurrPN;
+  }
+
+  // If we've made it here, it means we weren't able to replace the PHINode, so
+  // we must insert it ourselves.
+  PHINode *NewPN = cast<PHINode>(PN.clone());
+  NewPN->insertBefore(&*OverallPhiBlock->begin());
+  for (unsigned Idx = 0, Edx = NewPN->getNumIncomingValues(); Idx < Edx;
+       Idx++) {
+    Value *IncomingVal = NewPN->getIncomingValue(Idx);
+    BasicBlock *IncomingBlock = NewPN->getIncomingBlock(Idx);
+
+    // Find corresponding basic block in the overall function for the incoming
+    // block.
+    Instruction *FirstNonPHI = IncomingBlock->getFirstNonPHI();
+    assert(FirstNonPHI && "Incoming block is empty?");
+    Value *CorrespondingVal =
+        Region.findCorrespondingValueIn(*FirstRegion, FirstNonPHI);
+    assert(CorrespondingVal && "Value is nullptr?");
+    BasicBlock *BlockToUse = cast<Instruction>(CorrespondingVal)->getParent();
+    NewPN->setIncomingBlock(Idx, BlockToUse);
+
+    // If we have an argument we make sure we replace using the argument from
+    // the correct function.
+    if (Argument *A = dyn_cast<Argument>(IncomingVal)) {
+      unsigned ArgNum = A->getArgNo();
+      Value *Val = Group.OutlinedFunction->getArg(ArgNum);
+      NewPN->setIncomingValue(Idx, Val);
+      continue;
+    }
+    
+    // Find the corresponding value in the overall function.
+    IncomingVal = findOutputMapping(OutputMappings, IncomingVal);
+    Value *Val = Region.findCorrespondingValueIn(*FirstRegion, IncomingVal);
+    assert(Val && "Value is nullptr?");
+    NewPN->setIncomingValue(Idx, Val);
+  }
+  return NewPN;
+}
+
 // Within an extracted function, replace the argument uses of the extracted
 // region with the arguments of the function for an OutlinableGroup.
 //
@@ -1083,6 +1533,7 @@
 static void
 replaceArgumentUses(OutlinableRegion &Region,
                     DenseMap<Value *, BasicBlock *> &OutputBBs,
+                    const DenseMap<Value *, Value *> &OutputMappings,
                     bool FirstFunction = false) {
   OutlinableGroup &Group = *Region.Parent;
   assert(Region.ExtractedFunction && "Region has no extracted function?");
@@ -1136,8 +1587,7 @@
       if (!RI)
         continue;
       Value *RetVal = RI->getReturnValue();
-      DenseMap<Value *, BasicBlock *>::iterator VBBIt, PHIVBBIt;
-      VBBIt = OutputBBs.find(RetVal);
+      DenseMap<Value *, BasicBlock *>::iterator VBBIt = OutputBBs.find(RetVal);
       assert(VBBIt != OutputBBs.end() && "Could not find output value!");
 
       // If this is storing a PHINode, we must make sure it is included in the
@@ -1153,12 +1603,46 @@
       LLVM_DEBUG(dbgs() << "Move store for instruction " << *I << " to "
                         << *OutputBB << "\n");
 
-      if (FirstFunction)
+      // If this is storing a PHINode, we must make sure it is included in the
+      // overall function.
+      if (!isa<PHINode>(ValueOperand)) {
+        if (FirstFunction)
+          continue;
+        Value *CorrVal =
+            Region.findCorrespondingValueIn(*Group.Regions[0], ValueOperand);
+        assert(CorrVal && "Value is nullptr?");
+        NewI->setOperand(0, CorrVal);
         continue;
-      Value *CorrVal =
-          Region.findCorrespondingValueIn(*Group.Regions[0], ValueOperand);
-      assert(CorrVal && "Value is nullptr?");
-      NewI->setOperand(0, CorrVal);
+      }
+      PHINode *PN = cast<PHINode>(SI->getValueOperand());
+      // If it has a value, it was not split by the code extractor, which
+      // is what we are looking for.
+      if (Region.Candidate->getGVN(PN).hasValue())
+        continue;
+
+      // We record the parent block for the PHINode in the Region so that
+      // we can exclude it from checks later on.
+      Region.PHIBlocks.insert(std::make_pair(RetVal, PN->getParent()));
+
+      // If this is the first function, we do not need to worry about mergiing
+      // this with any other block in the overall outlined function, so we can
+      // just continue.
+      if (FirstFunction) {
+        BasicBlock *PHIBlock = PN->getParent();
+        Group.PHIBlocks.insert(std::make_pair(RetVal, PHIBlock));
+        continue;
+      }
+
+      // We look for the aggregate block that contains the PHINodes leading into
+      // this exit path. If we can't find one, we create one.
+      BasicBlock *OverallPhiBlock = findOrCreatePHIBlock(Group, RetVal);
+
+      // For our PHINode, we find the combined canonical numbering, and
+      // attempt to find a matching PHINode in the overall PHIBlock.  If we
+      // cannot, we copy the PHINode and move it into this new block.
+      PHINode *NewPN =
+          findOrCreatePHIInBlock(*PN, Region, OverallPhiBlock, OutputMappings);
+      NewI->setOperand(0, NewPN);
     }
 
     // If we added an edge for basic blocks without a predecessor, we remove it
@@ -1399,7 +1883,12 @@
     Module &M, OutlinableGroup &OG, DenseMap<Value *, BasicBlock *> &EndBBs,
     std::vector<DenseMap<Value *, BasicBlock *>> &OutputStoreBBs) {
   // We only need the switch statement if there is more than one store
-  // combination.
+  // combination, or there is more than one set of output blocks.  The first
+  // will occur when we store different sets of values for two different
+  // regions.  The second will occur when we have two outputs that are combined
+  // in a PHINode outside of the region in one outlined instance, and are used
+  // seaparately in another. This will create the same set of OutputGVNs, but
+  // will generate two different output schemes.
   if (OG.OutputGVNCombinations.size() > 1) {
     Function *AggFunc = OG.OutlinedFunction;
     // Create a final block for each different return block.
@@ -1442,8 +1931,14 @@
     return;
   }
 
+  assert(OutputStoreBBs.size() < 2 && "Different store sets not handled!");
+
   // If there needs to be stores, move them from the output blocks to their
-  // corresponding ending block.
+  // corresponding ending block.  We do not check that the OutputGVNCombinations
+  // is equal to 1 here since that could just been the case where there are 0
+  // outputs. Instead, we check whether there is more than one set of output
+  // blocks since this is the only case where we would have to move the
+  // stores, and erase the extraneous blocks.
   if (OutputStoreBBs.size() == 1) {
     LLVM_DEBUG(dbgs() << "Move store instructions to the end block in "
                       << *OG.OutlinedFunction << "\n");
@@ -1475,10 +1970,13 @@
 /// set of stores needed for the different functions.
 /// \param [in,out] FuncsToRemove - Extracted functions to erase from module
 /// once outlining is complete.
+/// \param [in] OutputMappings - Extracted functions to erase from module
+/// once outlining is complete.
 static void fillOverallFunction(
     Module &M, OutlinableGroup &CurrentGroup,
     std::vector<DenseMap<Value *, BasicBlock *>> &OutputStoreBBs,
-    std::vector<Function *> &FuncsToRemove) {
+    std::vector<Function *> &FuncsToRemove,
+    const DenseMap<Value *, Value *> &OutputMappings) {
   OutlinableRegion *CurrentOS = CurrentGroup.Regions[0];
 
   // Move first extracted function's instructions into new function.
@@ -1499,7 +1997,7 @@
                              CurrentGroup.OutlinedFunction, "output_block_0");
   CurrentOS->OutputBlockNum = 0;
 
-  replaceArgumentUses(*CurrentOS, NewBBs, true);
+  replaceArgumentUses(*CurrentOS, NewBBs, OutputMappings, true);
   replaceConstants(*CurrentOS);
 
   // We first identify if any output blocks are empty, if they are we remove
@@ -1533,7 +2031,8 @@
 
   OutlinableRegion *CurrentOS;
 
-  fillOverallFunction(M, CurrentGroup, OutputStoreBBs, FuncsToRemove);
+  fillOverallFunction(M, CurrentGroup, OutputStoreBBs, FuncsToRemove,
+                      OutputMappings);
 
   std::vector<Value *> SortedKeys;
   for (unsigned Idx = 1; Idx < CurrentGroup.Regions.size(); Idx++) {
@@ -1547,8 +2046,7 @@
     createAndInsertBasicBlocks(
         CurrentGroup.EndBBs, NewBBs, CurrentGroup.OutlinedFunction,
         "output_block_" + Twine(static_cast<unsigned>(Idx)));
-
-    replaceArgumentUses(*CurrentOS, NewBBs);
+    replaceArgumentUses(*CurrentOS, NewBBs, OutputMappings);
     alignOutputBlockWithAggFunc(CurrentGroup, *CurrentOS, NewBBs,
                                 CurrentGroup.EndBBs, OutputMappings,
                                 OutputStoreBBs);
@@ -1716,6 +2214,34 @@
   return RegionBenefit;
 }
 
+/// For the \p OutputCanon number passed in find the value represented by this
+/// canonical number. If it is from a PHINode, we pick the first incoming
+/// value and return that Value instead.
+///
+/// \param Region - The OutlinableRegion to get the Value from.
+/// \param OutputCanon - The canonical number to find the Value from.
+/// \returns The Value represented by a canonical number \p OutputCanon in \p
+/// Region.
+static Value *findOutputValueInRegion(OutlinableRegion &Region,
+                                      unsigned OutputCanon) {
+  OutlinableGroup &CurrentGroup = *Region.Parent;
+  // If the value is greater than the value in the tracker, we have a
+  // PHINode and will instead use one of the incoming values to find the
+  // type.
+  if (OutputCanon > CurrentGroup.PHINodeGVNTracker) {
+    auto It = CurrentGroup.PHINodeGVNToGVNs.find(OutputCanon);
+    assert(It != CurrentGroup.PHINodeGVNToGVNs.end() &&
+           "Could not find GVN set for PHINode number!");
+    assert(It->second.second.size() > 0 && "PHINode does not have any values!");
+    OutputCanon = *It->second.second.begin();
+  }
+  Optional<unsigned> OGVN = Region.Candidate->fromCanonicalNum(OutputCanon);
+  assert(OGVN.hasValue() && "Could not find GVN for Canonical Number?");
+  Optional<Value *> OV = Region.Candidate->fromGVN(*OGVN);
+  assert(OV.hasValue() && "Could not find value for GVN?");
+  return *OV;
+}
+
 InstructionCost
 IROutliner::findCostOutputReloads(OutlinableGroup &CurrentGroup) {
   InstructionCost OverallCost = 0;
@@ -1723,10 +2249,8 @@
     TargetTransformInfo &TTI = getTTI(*Region->StartBB->getParent());
 
     // Each output incurs a load after the call, so we add that to the cost.
-    for (unsigned OutputGVN : Region->GVNStores) {
-      Optional<Value *> OV = Region->Candidate->fromGVN(OutputGVN);
-      assert(OV.hasValue() && "Could not find value for GVN?");
-      Value *V = OV.getValue();
+    for (unsigned OutputCanon : Region->GVNStores) {
+      Value *V = findOutputValueInRegion(*Region, OutputCanon);
       InstructionCost LoadCost =
           TTI.getMemoryOpCost(Instruction::Load, V->getType(), Align(1), 0,
                               TargetTransformInfo::TCK_CodeSize);
@@ -1755,6 +2279,7 @@
   InstructionCost OutputCost = 0;
   unsigned NumOutputBranches = 0;
 
+  OutlinableRegion &FirstRegion = *CurrentGroup.Regions[0];
   IRSimilarityCandidate &Candidate = *CurrentGroup.Regions[0]->Candidate;
   DenseSet<BasicBlock *> CandidateBlocks;
   Candidate.getBasicBlocks(CandidateBlocks);
@@ -1780,10 +2305,8 @@
 
   for (const ArrayRef<unsigned> &OutputUse :
        CurrentGroup.OutputGVNCombinations) {
-    for (unsigned GVN : OutputUse) {
-      Optional<Value *> OV = Candidate.fromGVN(GVN);
-      assert(OV.hasValue() && "Could not find value for GVN?");
-      Value *V = OV.getValue();
+    for (unsigned OutputCanon : OutputUse) {
+      Value *V = findOutputValueInRegion(FirstRegion, OutputCanon);
       InstructionCost StoreCost =
           TTI.getMemoryOpCost(Instruction::Load, V->getType(), Align(1), 0,
                               TargetTransformInfo::TCK_CodeSize);
@@ -2043,8 +2566,8 @@
         continue;
 
       SmallVector<BasicBlock *> BE;
-      DenseSet<BasicBlock *> BBSet;
-      OS->Candidate->getBasicBlocks(BBSet, BE);
+      DenseSet<BasicBlock *> BlocksInRegion;
+      OS->Candidate->getBasicBlocks(BlocksInRegion, BE);
       OS->CE = new (ExtractorAllocator.Allocate())
           CodeExtractor(BE, nullptr, false, nullptr, nullptr, nullptr, false,
                         false, "outlined");
@@ -2154,8 +2677,8 @@
     OutlinedRegions.clear();
     for (OutlinableRegion *OS : CurrentGroup.Regions) {
       SmallVector<BasicBlock *> BE;
-      DenseSet<BasicBlock *> BBSet;
-      OS->Candidate->getBasicBlocks(BBSet, BE);
+      DenseSet<BasicBlock *> BlocksInRegion;
+      OS->Candidate->getBasicBlocks(BlocksInRegion, BE);
       OS->CE = new (ExtractorAllocator.Allocate())
           CodeExtractor(BE, nullptr, false, nullptr, nullptr, nullptr, false,
                         false, "outlined");
Index: llvm/test/Transforms/IROutliner/gvn-output-set-overload.ll
===================================================================
--- /dev/null
+++ llvm/test/Transforms/IROutliner/gvn-output-set-overload.ll
@@ -0,0 +1,122 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --include-generated-funcs
+; RUN: opt -S -verify -iroutliner -ir-outlining-no-cost < %s | FileCheck %s
+
+; Show that we do differentiate between outputs of the region stored in PHINodes
+; versus those stored outside of PHINodes.
+
+define void @function1(i32* %a, i32* %b) {
+entry:
+  %0 = alloca i32, align 4
+  %c = load i32, i32* %0, align 4
+  br label %test1
+test1:
+  %e = load i32, i32* %0, align 4
+  br i1 true, label %first, label %test
+test:
+  %d = load i32, i32* %0, align 4
+  br i1 true, label %first, label %next
+first:
+  %1 = phi i32 [ %c, %test ], [ %e, %test1 ]
+  ret void
+next:
+  ret void
+}
+
+define void @function2(i32* %a, i32* %b) {
+entry:
+  %0 = alloca i32, align 4
+  %c = load i32, i32* %0, align 4
+  br label %test1
+test1:
+  %e = load i32, i32* %0, align 4
+  br i1 true, label %first, label %test
+test:
+  %d = load i32, i32* %0, align 4
+  br i1 true, label %first, label %next
+first:
+  ret void
+next:
+  %1 = add i32 %c, 1
+  %2 = add i32 %e, 1
+  ret void
+}
+; CHECK-LABEL: @function1(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[DOTCE_LOC:%.*]] = alloca i32, align 4
+; CHECK-NEXT:    [[TMP0:%.*]] = alloca i32, align 4
+; CHECK-NEXT:    [[LT_CAST:%.*]] = bitcast i32* [[DOTCE_LOC]] to i8*
+; CHECK-NEXT:    call void @llvm.lifetime.start.p0i8(i64 -1, i8* [[LT_CAST]])
+; CHECK-NEXT:    [[TMP1:%.*]] = call i1 @outlined_ir_func_0(i32* [[TMP0]], i32* [[DOTCE_LOC]], i32* null, i32 0)
+; CHECK-NEXT:    [[DOTCE_RELOAD:%.*]] = load i32, i32* [[DOTCE_LOC]], align 4
+; CHECK-NEXT:    call void @llvm.lifetime.end.p0i8(i64 -1, i8* [[LT_CAST]])
+; CHECK-NEXT:    br i1 [[TMP1]], label [[FIRST:%.*]], label [[NEXT:%.*]]
+; CHECK:       first:
+; CHECK-NEXT:    [[TMP2:%.*]] = phi i32 [ [[DOTCE_RELOAD]], [[ENTRY:%.*]] ]
+; CHECK-NEXT:    ret void
+; CHECK:       next:
+; CHECK-NEXT:    ret void
+;
+;
+; CHECK-LABEL: @function2(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[E_LOC:%.*]] = alloca i32, align 4
+; CHECK-NEXT:    [[C_LOC:%.*]] = alloca i32, align 4
+; CHECK-NEXT:    [[TMP0:%.*]] = alloca i32, align 4
+; CHECK-NEXT:    [[LT_CAST:%.*]] = bitcast i32* [[C_LOC]] to i8*
+; CHECK-NEXT:    call void @llvm.lifetime.start.p0i8(i64 -1, i8* [[LT_CAST]])
+; CHECK-NEXT:    [[LT_CAST1:%.*]] = bitcast i32* [[E_LOC]] to i8*
+; CHECK-NEXT:    call void @llvm.lifetime.start.p0i8(i64 -1, i8* [[LT_CAST1]])
+; CHECK-NEXT:    [[TMP1:%.*]] = call i1 @outlined_ir_func_0(i32* [[TMP0]], i32* [[C_LOC]], i32* [[E_LOC]], i32 1)
+; CHECK-NEXT:    [[C_RELOAD:%.*]] = load i32, i32* [[C_LOC]], align 4
+; CHECK-NEXT:    [[E_RELOAD:%.*]] = load i32, i32* [[E_LOC]], align 4
+; CHECK-NEXT:    call void @llvm.lifetime.end.p0i8(i64 -1, i8* [[LT_CAST]])
+; CHECK-NEXT:    call void @llvm.lifetime.end.p0i8(i64 -1, i8* [[LT_CAST1]])
+; CHECK-NEXT:    br i1 [[TMP1]], label [[FIRST:%.*]], label [[NEXT:%.*]]
+; CHECK:       first:
+; CHECK-NEXT:    ret void
+; CHECK:       next:
+; CHECK-NEXT:    [[TMP2:%.*]] = add i32 [[C_RELOAD]], 1
+; CHECK-NEXT:    [[TMP3:%.*]] = add i32 [[E_RELOAD]], 1
+; CHECK-NEXT:    ret void
+;
+;
+; CHECK-LABEL: define internal i1 @outlined_ir_func_0(
+; CHECK-NEXT:  newFuncRoot:
+; CHECK-NEXT:    br label [[ENTRY_TO_OUTLINE:%.*]]
+; CHECK:       entry_to_outline:
+; CHECK-NEXT:    [[C:%.*]] = load i32, i32* [[TMP0:%.*]], align 4
+; CHECK-NEXT:    br label [[TEST1:%.*]]
+; CHECK:       test1:
+; CHECK-NEXT:    [[E:%.*]] = load i32, i32* [[TMP0]], align 4
+; CHECK-NEXT:    br i1 true, label [[FIRST_SPLIT:%.*]], label [[TEST:%.*]]
+; CHECK:       test:
+; CHECK-NEXT:    [[D:%.*]] = load i32, i32* [[TMP0]], align 4
+; CHECK-NEXT:    br i1 true, label [[FIRST_SPLIT]], label [[NEXT_EXITSTUB:%.*]]
+; CHECK:       first.split:
+; CHECK-NEXT:    [[DOTCE:%.*]] = phi i32 [ [[C]], [[TEST]] ], [ [[E]], [[TEST1]] ]
+; CHECK-NEXT:    br label [[FIRST_EXITSTUB:%.*]]
+; CHECK:       first.exitStub:
+; CHECK-NEXT:    switch i32 [[TMP3:%.*]], label [[FINAL_BLOCK_1:%.*]] [
+; CHECK-NEXT:    i32 0, label [[OUTPUT_BLOCK_0_1:%.*]]
+; CHECK-NEXT:    i32 1, label [[OUTPUT_BLOCK_1_1:%.*]]
+; CHECK-NEXT:    ]
+; CHECK:       next.exitStub:
+; CHECK-NEXT:    switch i32 [[TMP3]], label [[FINAL_BLOCK_0:%.*]] [
+; CHECK-NEXT:    i32 0, label [[OUTPUT_BLOCK_1_0:%.*]]
+; CHECK-NEXT:    ]
+; CHECK:       output_block_0_1:
+; CHECK-NEXT:    store i32 [[DOTCE]], i32* [[TMP1:%.*]], align 4
+; CHECK-NEXT:    br label [[FINAL_BLOCK_1]]
+; CHECK:       output_block_1_0:
+; CHECK-NEXT:    store i32 [[C]], i32* [[TMP1]], align 4
+; CHECK-NEXT:    store i32 [[E]], i32* [[TMP2:%.*]], align 4
+; CHECK-NEXT:    br label [[FINAL_BLOCK_0]]
+; CHECK:       output_block_1_1:
+; CHECK-NEXT:    store i32 [[C]], i32* [[TMP1]], align 4
+; CHECK-NEXT:    store i32 [[E]], i32* [[TMP2]], align 4
+; CHECK-NEXT:    br label [[FINAL_BLOCK_1]]
+; CHECK:       final_block_0:
+; CHECK-NEXT:    ret i1 false
+; CHECK:       final_block_1:
+; CHECK-NEXT:    ret i1 true
+;
Index: llvm/test/Transforms/IROutliner/mismatched-phi-exits-not-in-first-outlined.ll
===================================================================
--- /dev/null
+++ llvm/test/Transforms/IROutliner/mismatched-phi-exits-not-in-first-outlined.ll
@@ -0,0 +1,85 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --include-generated-funcs
+; RUN: opt -S -verify -iroutliner -ir-outlining-no-cost < %s | FileCheck %s
+
+; Show that we are able to extract blocks that contain PHINodes, and selectively
+; store into it's respective block, creating a new block if needed.
+
+define void @function1(i32* %a, i32* %b) {
+entry:
+  %0 = alloca i32, align 4
+  %c = load i32, i32* %0, align 4
+  br label %test1
+test1:
+  %e = load i32, i32* %0, align 4
+  br label %first
+test:
+  %d = load i32, i32* %0, align 4
+  br label %first
+first:
+  ret void
+}
+
+define void @function2(i32* %a, i32* %b) {
+entry:
+  %0 = alloca i32, align 4
+  %c = load i32, i32* %0, align 4
+  br label %test1
+test1:
+  %e = load i32, i32* %0, align 4
+  br label %first
+test:
+  %d = load i32, i32* %0, align 4
+  br label %first
+first:
+  %1 = phi i32 [ %c, %test ], [ %e, %test1 ]
+  ret void
+}
+; CHECK-LABEL: @function1(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP0:%.*]] = alloca i32, align 4
+; CHECK-NEXT:    call void @outlined_ir_func_0(i32* [[TMP0]], i32* null, i32 -1)
+; CHECK-NEXT:    br label [[FIRST:%.*]]
+; CHECK:       first:
+; CHECK-NEXT:    ret void
+;
+;
+; CHECK-LABEL: @function2(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[DOTCE_LOC:%.*]] = alloca i32, align 4
+; CHECK-NEXT:    [[TMP0:%.*]] = alloca i32, align 4
+; CHECK-NEXT:    [[LT_CAST:%.*]] = bitcast i32* [[DOTCE_LOC]] to i8*
+; CHECK-NEXT:    call void @llvm.lifetime.start.p0i8(i64 -1, i8* [[LT_CAST]])
+; CHECK-NEXT:    call void @outlined_ir_func_0(i32* [[TMP0]], i32* [[DOTCE_LOC]], i32 0)
+; CHECK-NEXT:    [[DOTCE_RELOAD:%.*]] = load i32, i32* [[DOTCE_LOC]], align 4
+; CHECK-NEXT:    call void @llvm.lifetime.end.p0i8(i64 -1, i8* [[LT_CAST]])
+; CHECK-NEXT:    br label [[FIRST:%.*]]
+; CHECK:       first:
+; CHECK-NEXT:    [[TMP1:%.*]] = phi i32 [ [[DOTCE_RELOAD]], [[ENTRY:%.*]] ]
+; CHECK-NEXT:    ret void
+;
+;
+; CHECK-LABEL: define internal void @outlined_ir_func_0(
+; CHECK-NEXT:  newFuncRoot:
+; CHECK-NEXT:    br label [[ENTRY_TO_OUTLINE:%.*]]
+; CHECK:       entry_to_outline:
+; CHECK-NEXT:    [[C:%.*]] = load i32, i32* [[TMP0:%.*]], align 4
+; CHECK-NEXT:    br label [[TEST1:%.*]]
+; CHECK:       test1:
+; CHECK-NEXT:    [[E:%.*]] = load i32, i32* [[TMP0]], align 4
+; CHECK-NEXT:    br label [[PHI_BLOCK:%.*]]
+; CHECK:       test:
+; CHECK-NEXT:    [[D:%.*]] = load i32, i32* [[TMP0]], align 4
+; CHECK-NEXT:    br label [[PHI_BLOCK]]
+; CHECK:       first.exitStub:
+; CHECK-NEXT:    switch i32 [[TMP2:%.*]], label [[FINAL_BLOCK_0:%.*]] [
+; CHECK-NEXT:    i32 0, label [[OUTPUT_BLOCK_1_0:%.*]]
+; CHECK-NEXT:    ]
+; CHECK:       output_block_1_0:
+; CHECK-NEXT:    store i32 [[TMP3:%.*]], i32* [[TMP1:%.*]], align 4
+; CHECK-NEXT:    br label [[FINAL_BLOCK_0]]
+; CHECK:       phi_block:
+; CHECK-NEXT:    [[TMP3]] = phi i32 [ [[C]], [[TEST:%.*]] ], [ [[E]], [[TEST1]] ]
+; CHECK-NEXT:    br label [[FIRST_EXITSTUB:%.*]]
+; CHECK:       final_block_0:
+; CHECK-NEXT:    ret void
+;
Index: llvm/test/Transforms/IROutliner/mismatched-phi-exits.ll
===================================================================
--- /dev/null
+++ llvm/test/Transforms/IROutliner/mismatched-phi-exits.ll
@@ -0,0 +1,85 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --include-generated-funcs
+; RUN: opt -S -verify -iroutliner -ir-outlining-no-cost < %s | FileCheck %s
+
+; Show that we are able to extract blocks that contain PHINodes, and selectively
+; store into it's respective block, only using if needed.
+
+define void @function1(i32* %a, i32* %b) {
+entry:
+  %0 = alloca i32, align 4
+  %c = load i32, i32* %0, align 4
+  br label %test1
+test1:
+  %e = load i32, i32* %0, align 4
+  br label %first
+test:
+  %d = load i32, i32* %0, align 4
+  br label %first
+first:
+  %1 = phi i32 [ %c, %test ], [ %e, %test1 ]
+  ret void
+}
+
+define void @function2(i32* %a, i32* %b) {
+entry:
+  %0 = alloca i32, align 4
+  %c = load i32, i32* %0, align 4
+  br label %test1
+test1:
+  %e = load i32, i32* %0, align 4
+  br label %first
+test:
+  %d = load i32, i32* %0, align 4
+  br label %first
+first:
+  ret void
+}
+; CHECK-LABEL: @function1(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[DOTCE_LOC:%.*]] = alloca i32, align 4
+; CHECK-NEXT:    [[TMP0:%.*]] = alloca i32, align 4
+; CHECK-NEXT:    [[LT_CAST:%.*]] = bitcast i32* [[DOTCE_LOC]] to i8*
+; CHECK-NEXT:    call void @llvm.lifetime.start.p0i8(i64 -1, i8* [[LT_CAST]])
+; CHECK-NEXT:    call void @outlined_ir_func_0(i32* [[TMP0]], i32* [[DOTCE_LOC]], i32 0)
+; CHECK-NEXT:    [[DOTCE_RELOAD:%.*]] = load i32, i32* [[DOTCE_LOC]], align 4
+; CHECK-NEXT:    call void @llvm.lifetime.end.p0i8(i64 -1, i8* [[LT_CAST]])
+; CHECK-NEXT:    br label [[FIRST:%.*]]
+; CHECK:       first:
+; CHECK-NEXT:    [[TMP1:%.*]] = phi i32 [ [[DOTCE_RELOAD]], [[ENTRY:%.*]] ]
+; CHECK-NEXT:    ret void
+;
+;
+; CHECK-LABEL: @function2(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP0:%.*]] = alloca i32, align 4
+; CHECK-NEXT:    call void @outlined_ir_func_0(i32* [[TMP0]], i32* null, i32 -1)
+; CHECK-NEXT:    br label [[FIRST:%.*]]
+; CHECK:       first:
+; CHECK-NEXT:    ret void
+;
+;
+; CHECK-LABEL: define internal void  @outlined_ir_func_0(
+; CHECK-NEXT:  newFuncRoot:
+; CHECK-NEXT:    br label [[ENTRY_TO_OUTLINE:%.*]]
+; CHECK:       entry_to_outline:
+; CHECK-NEXT:    [[C:%.*]] = load i32, i32* [[TMP0:%.*]], align 4
+; CHECK-NEXT:    br label [[TEST1:%.*]]
+; CHECK:       test1:
+; CHECK-NEXT:    [[E:%.*]] = load i32, i32* [[TMP0]], align 4
+; CHECK-NEXT:    br label [[FIRST_SPLIT:%.*]]
+; CHECK:       test:
+; CHECK-NEXT:    [[D:%.*]] = load i32, i32* [[TMP0]], align 4
+; CHECK-NEXT:    br label [[FIRST_SPLIT]]
+; CHECK:       first.split:
+; CHECK-NEXT:    [[DOTCE:%.*]] = phi i32 [ [[C]], [[TEST:%.*]] ], [ [[E]], [[TEST1]] ]
+; CHECK-NEXT:    br label [[FIRST_EXITSTUB:%.*]]
+; CHECK:       first.exitStub:
+; CHECK-NEXT:    switch i32 [[TMP2:%.*]], label [[FINAL_BLOCK_0:%.*]] [
+; CHECK-NEXT:    i32 0, label [[OUTPUT_BLOCK_0_0:%.*]]
+; CHECK-NEXT:    ]
+; CHECK:       output_block_0_0:
+; CHECK-NEXT:    store i32 [[DOTCE]], i32* [[TMP1:%.*]], align 4
+; CHECK-NEXT:    br label [[FINAL_BLOCK_0]]
+; CHECK:       final_block_0:
+; CHECK-NEXT:    ret void
+;
Index: llvm/test/Transforms/IROutliner/mismatched-phi-outputs-ordering.ll
===================================================================
--- /dev/null
+++ llvm/test/Transforms/IROutliner/mismatched-phi-outputs-ordering.ll
@@ -0,0 +1,150 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --include-generated-funcs
+; RUN: opt -S -verify -iroutliner -ir-outlining-no-cost < %s | FileCheck %s
+
+; Show that we do not extract similar regions that would involve the splitting
+; of phi nodes on exit.
+
+define void @function1(i32* %a, i32* %b) {
+entry:
+  %0 = alloca i32, align 4
+  %c = load i32, i32* %0, align 4
+  br label %test1
+test1:
+  %e = load i32, i32* %0, align 4
+  br i1 true, label %first, label %test
+test:
+  %d = load i32, i32* %0, align 4
+  br i1 true, label %first, label %next
+first:
+  %1 = phi i32 [ %c, %test ], [ %e, %test1 ]
+  ret void
+next:
+  %2 = add i32 %d, 1
+  %3 = add i32 %e, 1
+  ret void
+}
+
+define void @function2(i32* %a, i32* %b) {
+entry:
+  %0 = alloca i32, align 4
+  %c = load i32, i32* %0, align 4
+  br label %test1
+test1:
+  %e = load i32, i32* %0, align 4
+  br i1 true, label %first, label %test
+test:
+  %d = load i32, i32* %0, align 4
+  br i1 true, label %first, label %next
+first:
+  ret void
+next:
+  %1 = add i32 %d, 1
+  %2 = add i32 %e, 1
+  ret void
+}
+; CHECK-LABEL: @function1(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[DOTCE_LOC:%.*]] = alloca i32, align 4
+; CHECK-NEXT:    [[D_LOC:%.*]] = alloca i32, align 4
+; CHECK-NEXT:    [[E_LOC:%.*]] = alloca i32, align 4
+; CHECK-NEXT:    [[TMP0:%.*]] = alloca i32, align 4
+; CHECK-NEXT:    [[LT_CAST:%.*]] = bitcast i32* [[E_LOC]] to i8*
+; CHECK-NEXT:    call void @llvm.lifetime.start.p0i8(i64 -1, i8* [[LT_CAST]])
+; CHECK-NEXT:    [[LT_CAST1:%.*]] = bitcast i32* [[D_LOC]] to i8*
+; CHECK-NEXT:    call void @llvm.lifetime.start.p0i8(i64 -1, i8* [[LT_CAST1]])
+; CHECK-NEXT:    [[LT_CAST2:%.*]] = bitcast i32* [[DOTCE_LOC]] to i8*
+; CHECK-NEXT:    call void @llvm.lifetime.start.p0i8(i64 -1, i8* [[LT_CAST2]])
+; CHECK-NEXT:    [[TMP1:%.*]] = call i1 @outlined_ir_func_0(i32* [[TMP0]], i32* [[E_LOC]], i32* [[D_LOC]], i32* [[DOTCE_LOC]], i32 0)
+; CHECK-NEXT:    [[E_RELOAD:%.*]] = load i32, i32* [[E_LOC]], align 4
+; CHECK-NEXT:    [[D_RELOAD:%.*]] = load i32, i32* [[D_LOC]], align 4
+; CHECK-NEXT:    [[DOTCE_RELOAD:%.*]] = load i32, i32* [[DOTCE_LOC]], align 4
+; CHECK-NEXT:    call void @llvm.lifetime.end.p0i8(i64 -1, i8* [[LT_CAST]])
+; CHECK-NEXT:    call void @llvm.lifetime.end.p0i8(i64 -1, i8* [[LT_CAST1]])
+; CHECK-NEXT:    call void @llvm.lifetime.end.p0i8(i64 -1, i8* [[LT_CAST2]])
+; CHECK-NEXT:    br i1 [[TMP1]], label [[FIRST:%.*]], label [[NEXT:%.*]]
+; CHECK:       first:
+; CHECK-NEXT:    [[TMP2:%.*]] = phi i32 [ [[DOTCE_RELOAD]], [[ENTRY:%.*]] ]
+; CHECK-NEXT:    ret void
+; CHECK:       next:
+; CHECK-NEXT:    call void @outlined_ir_func_1(i32 [[D_RELOAD]], i32 [[E_RELOAD]])
+; CHECK-NEXT:    ret void
+;
+;
+; CHECK-LABEL: @function2(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[D_LOC:%.*]] = alloca i32, align 4
+; CHECK-NEXT:    [[E_LOC:%.*]] = alloca i32, align 4
+; CHECK-NEXT:    [[TMP0:%.*]] = alloca i32, align 4
+; CHECK-NEXT:    [[LT_CAST:%.*]] = bitcast i32* [[E_LOC]] to i8*
+; CHECK-NEXT:    call void @llvm.lifetime.start.p0i8(i64 -1, i8* [[LT_CAST]])
+; CHECK-NEXT:    [[LT_CAST1:%.*]] = bitcast i32* [[D_LOC]] to i8*
+; CHECK-NEXT:    call void @llvm.lifetime.start.p0i8(i64 -1, i8* [[LT_CAST1]])
+; CHECK-NEXT:    [[TMP1:%.*]] = call i1 @outlined_ir_func_0(i32* [[TMP0]], i32* [[E_LOC]], i32* [[D_LOC]], i32* null, i32 1)
+; CHECK-NEXT:    [[E_RELOAD:%.*]] = load i32, i32* [[E_LOC]], align 4
+; CHECK-NEXT:    [[D_RELOAD:%.*]] = load i32, i32* [[D_LOC]], align 4
+; CHECK-NEXT:    call void @llvm.lifetime.end.p0i8(i64 -1, i8* [[LT_CAST]])
+; CHECK-NEXT:    call void @llvm.lifetime.end.p0i8(i64 -1, i8* [[LT_CAST1]])
+; CHECK-NEXT:    br i1 [[TMP1]], label [[FIRST:%.*]], label [[NEXT:%.*]]
+; CHECK:       first:
+; CHECK-NEXT:    ret void
+; CHECK:       next:
+; CHECK-NEXT:    call void @outlined_ir_func_1(i32 [[D_RELOAD]], i32 [[E_RELOAD]])
+; CHECK-NEXT:    ret void
+;
+;
+; CHECK-LABEL: define internal i1 @outlined_ir_func_0(
+; CHECK-NEXT:  newFuncRoot:
+; CHECK-NEXT:    br label [[ENTRY_TO_OUTLINE:%.*]]
+; CHECK:       entry_to_outline:
+; CHECK-NEXT:    [[C:%.*]] = load i32, i32* [[TMP0:%.*]], align 4
+; CHECK-NEXT:    br label [[TEST1:%.*]]
+; CHECK:       test1:
+; CHECK-NEXT:    [[E:%.*]] = load i32, i32* [[TMP0]], align 4
+; CHECK-NEXT:    br i1 true, label [[FIRST_SPLIT:%.*]], label [[TEST:%.*]]
+; CHECK:       test:
+; CHECK-NEXT:    [[D:%.*]] = load i32, i32* [[TMP0]], align 4
+; CHECK-NEXT:    br i1 true, label [[FIRST_SPLIT]], label [[NEXT_EXITSTUB:%.*]]
+; CHECK:       first.split:
+; CHECK-NEXT:    [[DOTCE:%.*]] = phi i32 [ [[C]], [[TEST]] ], [ [[E]], [[TEST1]] ]
+; CHECK-NEXT:    br label [[FIRST_EXITSTUB:%.*]]
+; CHECK:       first.exitStub:
+; CHECK-NEXT:    switch i32 [[TMP4:%.*]], label [[FINAL_BLOCK_1:%.*]] [
+; CHECK-NEXT:    i32 0, label [[OUTPUT_BLOCK_0_1:%.*]]
+; CHECK-NEXT:    i32 1, label [[OUTPUT_BLOCK_1_1:%.*]]
+; CHECK-NEXT:    ]
+; CHECK:       next.exitStub:
+; CHECK-NEXT:    switch i32 [[TMP4]], label [[FINAL_BLOCK_0:%.*]] [
+; CHECK-NEXT:    i32 0, label [[OUTPUT_BLOCK_0_0:%.*]]
+; CHECK-NEXT:    i32 1, label [[OUTPUT_BLOCK_1_0:%.*]]
+; CHECK-NEXT:    ]
+; CHECK:       output_block_0_0:
+; CHECK-NEXT:    store i32 [[E]], i32* [[TMP1:%.*]], align 4
+; CHECK-NEXT:    store i32 [[D]], i32* [[TMP2:%.*]], align 4
+; CHECK-NEXT:    br label [[FINAL_BLOCK_0]]
+; CHECK:       output_block_0_1:
+; CHECK-NEXT:    store i32 [[E]], i32* [[TMP1]], align 4
+; CHECK-NEXT:    store i32 [[DOTCE]], i32* [[TMP3:%.*]], align 4
+; CHECK-NEXT:    br label [[FINAL_BLOCK_1]]
+; CHECK:       output_block_1_0:
+; CHECK-NEXT:    store i32 [[E]], i32* [[TMP1]], align 4
+; CHECK-NEXT:    store i32 [[D]], i32* [[TMP2]], align 4
+; CHECK-NEXT:    br label [[FINAL_BLOCK_0]]
+; CHECK:       output_block_1_1:
+; CHECK-NEXT:    store i32 [[E]], i32* [[TMP1]], align 4
+; CHECK-NEXT:    br label [[FINAL_BLOCK_1]]
+; CHECK:       final_block_0:
+; CHECK-NEXT:    ret i1 false
+; CHECK:       final_block_1:
+; CHECK-NEXT:    ret i1 true
+;
+;
+; CHECK-LABEL: @outlined_ir_func_1(
+; CHECK-NEXT:  newFuncRoot:
+; CHECK-NEXT:    br label [[NEXT_TO_OUTLINE:%.*]]
+; CHECK:       next_to_outline:
+; CHECK-NEXT:    [[TMP2:%.*]] = add i32 [[TMP0:%.*]], 1
+; CHECK-NEXT:    [[TMP3:%.*]] = add i32 [[TMP1:%.*]], 1
+; CHECK-NEXT:    br label [[NEXT_AFTER_OUTLINE_EXITSTUB:%.*]]
+; CHECK:       next_after_outline.exitStub:
+; CHECK-NEXT:    ret void
+;
Index: llvm/test/Transforms/IROutliner/outlining-branches-phi-nodes.ll
===================================================================
--- /dev/null
+++ llvm/test/Transforms/IROutliner/outlining-branches-phi-nodes.ll
@@ -0,0 +1,165 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --include-generated-funcs
+; RUN: opt -S -verify -iroutliner -ir-outlining-no-cost < %s | FileCheck %s
+
+; Here we have multiple exits, but the different sources, same outputs are
+; needed, this checks that they are compressed, and moved into the appropriate
+; output blocks.
+
+define void @outline_outputs1() #0 {
+entry:
+  %output = alloca i32, align 4
+  %result = alloca i32, align 4
+  %output2 = alloca i32, align 4
+  %result2 = alloca i32, align 4
+  %a = alloca i32, align 4
+  %b = alloca i32, align 4
+  br label %block_2
+block_1:
+  %a2 = alloca i32, align 4
+  %b2 = alloca i32, align 4
+  br label %block_2
+block_2:
+  %a2val = load i32, i32* %a
+  %b2val = load i32, i32* %b
+  %add2 = add i32 2, %a2val
+  %mul2 = mul i32 2, %b2val
+  br label %block_5
+block_3:
+  %aval = load i32, i32* %a
+  %bval = load i32, i32* %b
+  %add = add i32 2, %aval
+  %mul = mul i32 2, %bval
+  br label %block_4
+block_4:
+  store i32 %add, i32* %output, align 4
+  store i32 %mul, i32* %result, align 4
+  br label %block_6
+block_5:
+  store i32 %add2, i32* %output, align 4
+  store i32 %mul2, i32* %result, align 4
+  br label %block_6
+block_6:
+  %diff = phi i32 [%aval, %block_4], [%a2val, %block_5]
+  ret void
+}
+
+define void @outline_outputs2() #0 {
+entry:
+  %output = alloca i32, align 4
+  %result = alloca i32, align 4
+  %output2 = alloca i32, align 4
+  %result2 = alloca i32, align 4
+  %a = alloca i32, align 4
+  %b = alloca i32, align 4
+  br label %block_2
+block_1:
+  %a2 = alloca i32, align 4
+  %b2 = alloca i32, align 4
+  br label %block_2
+block_2:
+  %a2val = load i32, i32* %a
+  %b2val = load i32, i32* %b
+  %add2 = add i32 2, %a2val
+  %mul2 = mul i32 2, %b2val
+  br label %block_5
+block_3:
+  %aval = load i32, i32* %a
+  %bval = load i32, i32* %b
+  %add = add i32 2, %aval
+  %mul = mul i32 2, %bval
+  br label %block_4
+block_4:
+  store i32 %add, i32* %output, align 4
+  store i32 %mul, i32* %result, align 4
+  br label %block_6
+block_5:
+  store i32 %add2, i32* %output, align 4
+  store i32 %mul2, i32* %result, align 4
+  br label %block_6
+block_6:
+  %diff = phi i32 [%aval, %block_4], [%a2val, %block_5]
+  ret void
+}
+
+; CHECK-LABEL: @outline_outputs1(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[DIFF_CE_LOC:%.*]] = alloca i32, align 4
+; CHECK-NEXT:    [[OUTPUT:%.*]] = alloca i32, align 4
+; CHECK-NEXT:    [[RESULT:%.*]] = alloca i32, align 4
+; CHECK-NEXT:    [[OUTPUT2:%.*]] = alloca i32, align 4
+; CHECK-NEXT:    [[RESULT2:%.*]] = alloca i32, align 4
+; CHECK-NEXT:    [[A:%.*]] = alloca i32, align 4
+; CHECK-NEXT:    [[B:%.*]] = alloca i32, align 4
+; CHECK-NEXT:    br label [[BLOCK_2:%.*]]
+; CHECK:       block_1:
+; CHECK-NEXT:    [[A2:%.*]] = alloca i32, align 4
+; CHECK-NEXT:    [[B2:%.*]] = alloca i32, align 4
+; CHECK-NEXT:    br label [[BLOCK_2]]
+; CHECK:       block_2:
+; CHECK-NEXT:    [[LT_CAST:%.*]] = bitcast i32* [[DIFF_CE_LOC]] to i8*
+; CHECK-NEXT:    call void @llvm.lifetime.start.p0i8(i64 -1, i8* [[LT_CAST]])
+; CHECK-NEXT:    call void @outlined_ir_func_0(i32* [[A]], i32* [[B]], i32* [[OUTPUT]], i32* [[RESULT]], i32* [[DIFF_CE_LOC]])
+; CHECK-NEXT:    [[DIFF_CE_RELOAD:%.*]] = load i32, i32* [[DIFF_CE_LOC]], align 4
+; CHECK-NEXT:    call void @llvm.lifetime.end.p0i8(i64 -1, i8* [[LT_CAST]])
+; CHECK-NEXT:    br label [[BLOCK_6:%.*]]
+; CHECK:       block_6:
+; CHECK-NEXT:    [[DIFF:%.*]] = phi i32 [ [[DIFF_CE_RELOAD]], [[BLOCK_2]] ]
+; CHECK-NEXT:    ret void
+;
+;
+; CHECK-LABEL: @outline_outputs2(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[DIFF_CE_LOC:%.*]] = alloca i32, align 4
+; CHECK-NEXT:    [[OUTPUT:%.*]] = alloca i32, align 4
+; CHECK-NEXT:    [[RESULT:%.*]] = alloca i32, align 4
+; CHECK-NEXT:    [[OUTPUT2:%.*]] = alloca i32, align 4
+; CHECK-NEXT:    [[RESULT2:%.*]] = alloca i32, align 4
+; CHECK-NEXT:    [[A:%.*]] = alloca i32, align 4
+; CHECK-NEXT:    [[B:%.*]] = alloca i32, align 4
+; CHECK-NEXT:    br label [[BLOCK_2:%.*]]
+; CHECK:       block_1:
+; CHECK-NEXT:    [[A2:%.*]] = alloca i32, align 4
+; CHECK-NEXT:    [[B2:%.*]] = alloca i32, align 4
+; CHECK-NEXT:    br label [[BLOCK_2]]
+; CHECK:       block_2:
+; CHECK-NEXT:    [[LT_CAST:%.*]] = bitcast i32* [[DIFF_CE_LOC]] to i8*
+; CHECK-NEXT:    call void @llvm.lifetime.start.p0i8(i64 -1, i8* [[LT_CAST]])
+; CHECK-NEXT:    call void @outlined_ir_func_0(i32* [[A]], i32* [[B]], i32* [[OUTPUT]], i32* [[RESULT]], i32* [[DIFF_CE_LOC]])
+; CHECK-NEXT:    [[DIFF_CE_RELOAD:%.*]] = load i32, i32* [[DIFF_CE_LOC]], align 4
+; CHECK-NEXT:    call void @llvm.lifetime.end.p0i8(i64 -1, i8* [[LT_CAST]])
+; CHECK-NEXT:    br label [[BLOCK_6:%.*]]
+; CHECK:       block_6:
+; CHECK-NEXT:    [[DIFF:%.*]] = phi i32 [ [[DIFF_CE_RELOAD]], [[BLOCK_2]] ]
+; CHECK-NEXT:    ret void
+;
+;
+; CHECK:  define internal void @outlined_ir_func_0(
+; CHECK-NEXT:  newFuncRoot:
+; CHECK-NEXT:    br label [[BLOCK_2_TO_OUTLINE:%.*]]
+; CHECK:       block_2_to_outline:
+; CHECK-NEXT:    [[A2VAL:%.*]] = load i32, i32* [[TMP0:%.*]], align 4
+; CHECK-NEXT:    [[B2VAL:%.*]] = load i32, i32* [[TMP1:%.*]], align 4
+; CHECK-NEXT:    [[ADD2:%.*]] = add i32 2, [[A2VAL]]
+; CHECK-NEXT:    [[MUL2:%.*]] = mul i32 2, [[B2VAL]]
+; CHECK-NEXT:    br label [[BLOCK_5:%.*]]
+; CHECK:       block_3:
+; CHECK-NEXT:    [[AVAL:%.*]] = load i32, i32* [[TMP0]], align 4
+; CHECK-NEXT:    [[BVAL:%.*]] = load i32, i32* [[TMP1]], align 4
+; CHECK-NEXT:    [[ADD:%.*]] = add i32 2, [[AVAL]]
+; CHECK-NEXT:    [[MUL:%.*]] = mul i32 2, [[BVAL]]
+; CHECK-NEXT:    br label [[BLOCK_4:%.*]]
+; CHECK:       block_4:
+; CHECK-NEXT:    store i32 [[ADD]], i32* [[TMP2:%.*]], align 4
+; CHECK-NEXT:    store i32 [[MUL]], i32* [[TMP3:%.*]], align 4
+; CHECK-NEXT:    br label [[BLOCK_6_SPLIT:%.*]]
+; CHECK:       block_5:
+; CHECK-NEXT:    store i32 [[ADD2]], i32* [[TMP2]], align 4
+; CHECK-NEXT:    store i32 [[MUL2]], i32* [[TMP3]], align 4
+; CHECK-NEXT:    br label [[BLOCK_6_SPLIT]]
+; CHECK:       block_6.split:
+; CHECK-NEXT:    [[DIFF_CE:%.*]] = phi i32 [ [[AVAL]], [[BLOCK_4]] ], [ [[A2VAL]], [[BLOCK_5]] ]
+; CHECK-NEXT:    br label [[BLOCK_6_EXITSTUB:%.*]]
+; CHECK:       block_6.exitStub:
+; CHECK-NEXT:    store i32 [[DIFF_CE]], i32* [[TMP4:%.*]], align 4
+; CHECK-NEXT:    ret void
+;
Index: llvm/test/Transforms/IROutliner/outlining-exits-to-phi-node.ll
===================================================================
--- llvm/test/Transforms/IROutliner/outlining-exits-to-phi-node.ll
+++ llvm/test/Transforms/IROutliner/outlining-exits-to-phi-node.ll
@@ -37,42 +37,50 @@
 }
 ; CHECK-LABEL: @function1(
 ; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[DOTCE_LOC:%.*]] = alloca i32, align 4
 ; CHECK-NEXT:    [[TMP0:%.*]] = alloca i32, align 4
-; CHECK-NEXT:    [[C:%.*]] = load i32, i32* [[TMP0]], align 4
-; CHECK-NEXT:    br label [[TEST1:%.*]]
-; CHECK:       test1:
-; CHECK-NEXT:    [[E:%.*]] = load i32, i32* [[TMP0]], align 4
+; CHECK-NEXT:    [[LT_CAST:%.*]] = bitcast i32* [[DOTCE_LOC]] to i8*
+; CHECK-NEXT:    call void @llvm.lifetime.start.p0i8(i64 -1, i8* [[LT_CAST]])
+; CHECK-NEXT:    call void @outlined_ir_func_0(i32* [[TMP0]], i32* [[DOTCE_LOC]])
+; CHECK-NEXT:    [[DOTCE_RELOAD:%.*]] = load i32, i32* [[DOTCE_LOC]], align 4
+; CHECK-NEXT:    call void @llvm.lifetime.end.p0i8(i64 -1, i8* [[LT_CAST]])
 ; CHECK-NEXT:    br label [[FIRST:%.*]]
-; CHECK:       test:
-; CHECK-NEXT:    call void @outlined_ir_func_0(i32* [[TMP0]])
-; CHECK-NEXT:    br label [[FIRST]]
 ; CHECK:       first:
-; CHECK-NEXT:    [[TMP1:%.*]] = phi i32 [ [[C]], [[TEST:%.*]] ], [ [[E]], [[TEST1]] ]
+; CHECK-NEXT:    [[TMP1:%.*]] = phi i32 [ [[DOTCE_RELOAD]], [[ENTRY:%.*]] ]
 ; CHECK-NEXT:    ret void
 ;
 ;
 ; CHECK-LABEL: @function2(
 ; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[DOTCE_LOC:%.*]] = alloca i32, align 4
 ; CHECK-NEXT:    [[TMP0:%.*]] = alloca i32, align 4
-; CHECK-NEXT:    [[C:%.*]] = load i32, i32* [[TMP0]], align 4
-; CHECK-NEXT:    br label [[TEST1:%.*]]
-; CHECK:       test1:
-; CHECK-NEXT:    [[E:%.*]] = load i32, i32* [[TMP0]], align 4
+; CHECK-NEXT:    [[LT_CAST:%.*]] = bitcast i32* [[DOTCE_LOC]] to i8*
+; CHECK-NEXT:    call void @llvm.lifetime.start.p0i8(i64 -1, i8* [[LT_CAST]])
+; CHECK-NEXT:    call void @outlined_ir_func_0(i32* [[TMP0]], i32* [[DOTCE_LOC]])
+; CHECK-NEXT:    [[DOTCE_RELOAD:%.*]] = load i32, i32* [[DOTCE_LOC]], align 4
+; CHECK-NEXT:    call void @llvm.lifetime.end.p0i8(i64 -1, i8* [[LT_CAST]])
 ; CHECK-NEXT:    br label [[FIRST:%.*]]
-; CHECK:       test:
-; CHECK-NEXT:    call void @outlined_ir_func_0(i32* [[TMP0]])
-; CHECK-NEXT:    br label [[FIRST]]
 ; CHECK:       first:
-; CHECK-NEXT:    [[TMP1:%.*]] = phi i32 [ [[C]], [[TEST:%.*]] ], [ [[E]], [[TEST1]] ]
+; CHECK-NEXT:    [[TMP1:%.*]] = phi i32 [ [[DOTCE_RELOAD]], [[ENTRY:%.*]] ]
 ; CHECK-NEXT:    ret void
 ;
 ;
-; CHECK: define internal void @outlined_ir_func_0(
+; CHECK-LABEL: define internal void @outlined_ir_func_0(
 ; CHECK-NEXT:  newFuncRoot:
-; CHECK-NEXT:    br label [[TEST_TO_OUTLINE:%.*]]
-; CHECK:       test_to_outline:
-; CHECK-NEXT:    [[D:%.*]] = load i32, i32* [[TMP0:%.*]], align 4
+; CHECK-NEXT:    br label [[ENTRY_TO_OUTLINE:%.*]]
+; CHECK:       entry_to_outline:
+; CHECK-NEXT:    [[C:%.*]] = load i32, i32* [[TMP0:%.*]], align 4
+; CHECK-NEXT:    br label [[TEST1:%.*]]
+; CHECK:       test1:
+; CHECK-NEXT:    [[E:%.*]] = load i32, i32* [[TMP0]], align 4
+; CHECK-NEXT:    br label [[FIRST_SPLIT:%.*]]
+; CHECK:       test:
+; CHECK-NEXT:    [[D:%.*]] = load i32, i32* [[TMP0]], align 4
+; CHECK-NEXT:    br label [[FIRST_SPLIT]]
+; CHECK:       first.split:
+; CHECK-NEXT:    [[DOTCE:%.*]] = phi i32 [ [[C]], [[TEST:%.*]] ], [ [[E]], [[TEST1]] ]
 ; CHECK-NEXT:    br label [[FIRST_EXITSTUB:%.*]]
 ; CHECK:       first.exitStub:
+; CHECK-NEXT:    store i32 [[DOTCE]], i32* [[TMP1:%.*]], align 4
 ; CHECK-NEXT:    ret void
 ;
Index: llvm/test/Transforms/IROutliner/phi-nodes-output-overload.ll
===================================================================
--- /dev/null
+++ llvm/test/Transforms/IROutliner/phi-nodes-output-overload.ll
@@ -0,0 +1,112 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --include-generated-funcs
+; RUN: opt -S -verify -iroutliner -ir-outlining-no-cost < %s | FileCheck %s
+
+; Show that we do not extract similar regions that would involve the splitting
+; of phi nodes on exit.
+
+define void @function1(i32* %a, i32* %b) {
+entry:
+  %0 = alloca i32, align 4
+  %c = load i32, i32* %0, align 4
+  br label %test1
+test1:
+  %e = load i32, i32* %0, align 4
+  br i1 true, label %first, label %next
+test:
+  %d = load i32, i32* %0, align 4
+  br i1 true, label %first, label %next
+first:
+  %1 = phi i32 [ %c, %test ], [ %e, %test1 ]
+  ret void
+next:
+  ret void
+}
+
+define void @function2(i32* %a, i32* %b) {
+entry:
+  %0 = alloca i32, align 4
+  %c = load i32, i32* %0, align 4
+  br label %test1
+test1:
+  %e = load i32, i32* %0, align 4
+  br i1 true, label %first, label %next
+test:
+  %d = load i32, i32* %0, align 4
+  br i1 true, label %first, label %next
+first:
+  ret void
+next:
+  %1 = phi i32 [ %c, %test ], [ %e, %test1 ]
+  ret void
+}
+; CHECK-LABEL: @function1(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[DOTCE_LOC:%.*]] = alloca i32, align 4
+; CHECK-NEXT:    [[TMP0:%.*]] = alloca i32, align 4
+; CHECK-NEXT:    [[LT_CAST:%.*]] = bitcast i32* [[DOTCE_LOC]] to i8*
+; CHECK-NEXT:    call void @llvm.lifetime.start.p0i8(i64 -1, i8* [[LT_CAST]])
+; CHECK-NEXT:    [[TMP1:%.*]] = call i1 @outlined_ir_func_0(i32* [[TMP0]], i32* [[DOTCE_LOC]], i32 0)
+; CHECK-NEXT:    [[DOTCE_RELOAD:%.*]] = load i32, i32* [[DOTCE_LOC]], align 4
+; CHECK-NEXT:    call void @llvm.lifetime.end.p0i8(i64 -1, i8* [[LT_CAST]])
+; CHECK-NEXT:    br i1 [[TMP1]], label [[FIRST:%.*]], label [[NEXT:%.*]]
+; CHECK:       first:
+; CHECK-NEXT:    [[TMP2:%.*]] = phi i32 [ [[DOTCE_RELOAD]], [[ENTRY:%.*]] ]
+; CHECK-NEXT:    ret void
+; CHECK:       next:
+; CHECK-NEXT:    ret void
+;
+;
+; CHECK-LABEL: @function2(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[DOTCE_LOC:%.*]] = alloca i32, align 4
+; CHECK-NEXT:    [[TMP0:%.*]] = alloca i32, align 4
+; CHECK-NEXT:    [[LT_CAST:%.*]] = bitcast i32* [[DOTCE_LOC]] to i8*
+; CHECK-NEXT:    call void @llvm.lifetime.start.p0i8(i64 -1, i8* [[LT_CAST]])
+; CHECK-NEXT:    [[TMP1:%.*]] = call i1 @outlined_ir_func_0(i32* [[TMP0]], i32* [[DOTCE_LOC]], i32 1)
+; CHECK-NEXT:    [[DOTCE_RELOAD:%.*]] = load i32, i32* [[DOTCE_LOC]], align 4
+; CHECK-NEXT:    call void @llvm.lifetime.end.p0i8(i64 -1, i8* [[LT_CAST]])
+; CHECK-NEXT:    br i1 [[TMP1]], label [[FIRST:%.*]], label [[NEXT:%.*]]
+; CHECK:       first:
+; CHECK-NEXT:    ret void
+; CHECK:       next:
+; CHECK-NEXT:    [[TMP2:%.*]] = phi i32 [ [[DOTCE_RELOAD]], [[ENTRY:%.*]] ]
+; CHECK-NEXT:    ret void
+;
+;
+; CHECK-LABEL: define internal i1 @outlined_ir_func_0(
+; CHECK-NEXT:  newFuncRoot:
+; CHECK-NEXT:    br label [[ENTRY_TO_OUTLINE:%.*]]
+; CHECK:       entry_to_outline:
+; CHECK-NEXT:    [[C:%.*]] = load i32, i32* [[TMP0:%.*]], align 4
+; CHECK-NEXT:    br label [[TEST1:%.*]]
+; CHECK:       test1:
+; CHECK-NEXT:    [[E:%.*]] = load i32, i32* [[TMP0]], align 4
+; CHECK-NEXT:    br i1 true, label [[FIRST_SPLIT:%.*]], label [[PHI_BLOCK:%.*]]
+; CHECK:       test:
+; CHECK-NEXT:    [[D:%.*]] = load i32, i32* [[TMP0]], align 4
+; CHECK-NEXT:    br i1 true, label [[FIRST_SPLIT]], label [[PHI_BLOCK]]
+; CHECK:       first.split:
+; CHECK-NEXT:    [[DOTCE:%.*]] = phi i32 [ [[C]], [[TEST:%.*]] ], [ [[E]], [[TEST1]] ]
+; CHECK-NEXT:    br label [[FIRST_EXITSTUB:%.*]]
+; CHECK:       first.exitStub:
+; CHECK-NEXT:    switch i32 [[TMP2:%.*]], label [[FINAL_BLOCK_1:%.*]] [
+; CHECK-NEXT:    i32 0, label [[OUTPUT_BLOCK_0_1:%.*]]
+; CHECK-NEXT:    ]
+; CHECK:       next.exitStub:
+; CHECK-NEXT:    switch i32 [[TMP2]], label [[FINAL_BLOCK_0:%.*]] [
+; CHECK-NEXT:    i32 0, label [[OUTPUT_BLOCK_1_0:%.*]]
+; CHECK-NEXT:    ]
+; CHECK:       output_block_0_1:
+; CHECK-NEXT:    store i32 [[DOTCE]], i32* [[TMP1:%.*]], align 4
+; CHECK-NEXT:    br label [[FINAL_BLOCK_1]]
+; CHECK:       output_block_1_0:
+; CHECK-NEXT:    store i32 [[TMP3:%.*]], i32* [[TMP1]], align 4
+; CHECK-NEXT:    br label [[FINAL_BLOCK_0]]
+; CHECK:       phi_block:
+; CHECK-NEXT:    [[TMP3]] = phi i32 [ [[C]], [[TEST]] ], [ [[E]], [[TEST1]] ]
+; CHECK-NEXT:    br label [[NEXT_EXITSTUB:%.*]]
+; CHECK:       final_block_0:
+; CHECK-NEXT:    ret i1 false
+; CHECK:       final_block_1:
+; CHECK-NEXT:    ret i1 true
+;
Index: llvm/test/Transforms/IROutliner/region-inputs-in-phi-nodes.ll
===================================================================
--- /dev/null
+++ llvm/test/Transforms/IROutliner/region-inputs-in-phi-nodes.ll
@@ -0,0 +1,104 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --include-generated-funcs
+; RUN: opt -S -verify -iroutliner -ir-outlining-no-cost < %s | FileCheck %s
+
+; Show that we are able to propogate inputs to the region into the split PHINode
+; outside of the region if necessary.
+
+define void @function1(i32* %a, i32* %b) {
+entry:
+  %0 = alloca i32, align 4
+  %c = load i32, i32* %0, align 4
+  %z = add i32 %c, %c
+  br i1 true, label %test1, label %first
+test1:
+  %e = load i32, i32* %0, align 4
+  %1 = add i32 %c, %c
+  br i1 true, label %first, label %test
+test:
+  %d = load i32, i32* %0, align 4
+  br i1 true, label %first, label %next
+first:
+  %2 = phi i32 [ %d, %test ], [ %e, %test1 ], [ %c, %entry ]
+  ret void
+next:
+  ret void
+}
+
+define void @function2(i32* %a, i32* %b) {
+entry:
+  %0 = alloca i32, align 4
+  %c = load i32, i32* %0, align 4
+  %z = mul i32 %c, %c
+  br i1 true, label %test1, label %first
+test1:
+  %e = load i32, i32* %0, align 4
+  %1 = add i32 %c, %c
+  br i1 true, label %first, label %test
+test:
+  %d = load i32, i32* %0, align 4
+  br i1 true, label %first, label %next
+first:
+  %2 = phi i32 [ %d, %test ], [ %e, %test1 ], [ %c, %entry ]
+  ret void
+next:
+  ret void
+}
+; CHECK-LABEL: @function1(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[DOTCE_LOC:%.*]] = alloca i32, align 4
+; CHECK-NEXT:    [[TMP0:%.*]] = alloca i32, align 4
+; CHECK-NEXT:    [[C:%.*]] = load i32, i32* [[TMP0]], align 4
+; CHECK-NEXT:    [[Z:%.*]] = add i32 [[C]], [[C]]
+; CHECK-NEXT:    [[LT_CAST:%.*]] = bitcast i32* [[DOTCE_LOC]] to i8*
+; CHECK-NEXT:    call void @llvm.lifetime.start.p0i8(i64 -1, i8* [[LT_CAST]])
+; CHECK-NEXT:    [[TARGETBLOCK:%.*]] = call i1 @outlined_ir_func_0(i32* [[TMP0]], i32 [[C]], i32* [[DOTCE_LOC]])
+; CHECK-NEXT:    [[DOTCE_RELOAD:%.*]] = load i32, i32* [[DOTCE_LOC]], align 4
+; CHECK-NEXT:    call void @llvm.lifetime.end.p0i8(i64 -1, i8* [[LT_CAST]])
+; CHECK-NEXT:    br i1 [[TARGETBLOCK]], label [[FIRST:%.*]], label [[NEXT:%.*]]
+; CHECK:       first:
+; CHECK-NEXT:    [[TMP1:%.*]] = phi i32 [ [[DOTCE_RELOAD]], [[ENTRY:%.*]] ]
+; CHECK-NEXT:    ret void
+; CHECK:       next:
+; CHECK-NEXT:    ret void
+;
+;
+; CHECK-LABEL: @function2(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[DOTCE_LOC:%.*]] = alloca i32, align 4
+; CHECK-NEXT:    [[TMP0:%.*]] = alloca i32, align 4
+; CHECK-NEXT:    [[C:%.*]] = load i32, i32* [[TMP0]], align 4
+; CHECK-NEXT:    [[Z:%.*]] = mul i32 [[C]], [[C]]
+; CHECK-NEXT:    [[LT_CAST:%.*]] = bitcast i32* [[DOTCE_LOC]] to i8*
+; CHECK-NEXT:    call void @llvm.lifetime.start.p0i8(i64 -1, i8* [[LT_CAST]])
+; CHECK-NEXT:    [[TARGETBLOCK:%.*]] = call i1 @outlined_ir_func_0(i32* [[TMP0]], i32 [[C]], i32* [[DOTCE_LOC]])
+; CHECK-NEXT:    [[DOTCE_RELOAD:%.*]] = load i32, i32* [[DOTCE_LOC]], align 4
+; CHECK-NEXT:    call void @llvm.lifetime.end.p0i8(i64 -1, i8* [[LT_CAST]])
+; CHECK-NEXT:    br i1 [[TARGETBLOCK]], label [[FIRST:%.*]], label [[NEXT:%.*]]
+; CHECK:       first:
+; CHECK-NEXT:    [[TMP1:%.*]] = phi i32 [ [[DOTCE_RELOAD]], [[ENTRY:%.*]] ]
+; CHECK-NEXT:    ret void
+; CHECK:       next:
+; CHECK-NEXT:    ret void
+;
+;
+; CHECK-LABEL: define internal i1 @outlined_ir_func_0(
+; CHECK-NEXT:  newFuncRoot:
+; CHECK-NEXT:    br label [[ENTRY_TO_OUTLINE:%.*]]
+; CHECK:       entry_to_outline:
+; CHECK-NEXT:    br i1 true, label [[TEST1:%.*]], label [[FIRST_SPLIT:%.*]]
+; CHECK:       test1:
+; CHECK-NEXT:    [[E:%.*]] = load i32, i32* [[TMP0:%.*]], align 4
+; CHECK-NEXT:    [[TMP3:%.*]] = add i32 [[TMP1:%.*]], [[TMP1]]
+; CHECK-NEXT:    br i1 true, label [[FIRST_SPLIT]], label [[TEST:%.*]]
+; CHECK:       test:
+; CHECK-NEXT:    [[D:%.*]] = load i32, i32* [[TMP0]], align 4
+; CHECK-NEXT:    br i1 true, label [[FIRST_SPLIT]], label [[NEXT_EXITSTUB:%.*]]
+; CHECK:       first.split:
+; CHECK-NEXT:    [[DOTCE:%.*]] = phi i32 [ [[D]], [[TEST]] ], [ [[E]], [[TEST1]] ], [ [[TMP1]], [[ENTRY_TO_OUTLINE]] ]
+; CHECK-NEXT:    br label [[FIRST_EXITSTUB:%.*]]
+; CHECK:       first.exitStub:
+; CHECK-NEXT:    store i32 [[DOTCE]], i32* [[TMP2:%.*]], align 4
+; CHECK-NEXT:    ret i1 true
+; CHECK:       next.exitStub:
+; CHECK-NEXT:    ret i1 false
+;