Index: llvm/include/llvm/Analysis/IRSimilarityIdentifier.h =================================================================== --- llvm/include/llvm/Analysis/IRSimilarityIdentifier.h +++ llvm/include/llvm/Analysis/IRSimilarityIdentifier.h @@ -127,6 +127,8 @@ /// to a less than form. It is None otherwise. Optional RevisedPredicate; + SmallVector RelativeBlockLocations; + /// Gather the information that is difficult to gather for an Instruction, or /// is changed. i.e. the operands of an Instruction and the Types of those /// operands. This extra information allows for similarity matching to make @@ -145,6 +147,9 @@ /// \return the consistent comparison predicate. static CmpInst::Predicate predicateForConsistency(CmpInst *CI); + void + setBranchSuccessors(DenseMap &BasicBlockToInteger); + /// Hashes \p Value based on its opcode, types, and operand types. /// Two IRInstructionData instances produce the same hash when they perform /// the same operation. @@ -189,6 +194,11 @@ llvm::hash_value(ID.Inst->getType()), llvm::hash_value(CI->getCalledFunction()->getName().str()), llvm::hash_combine_range(OperTypes.begin(), OperTypes.end())); + else if (BranchInst *BI = dyn_cast(ID.Inst)) + return hash_combine( + llvm::hash_value(ID.Inst->getOpcode()), + llvm::hash_value(ID.Inst->getType()), + llvm::hash_combine_range(OperTypes.begin(), OperTypes.end())); return llvm::hash_combine( llvm::hash_value(ID.Inst->getOpcode()), llvm::hash_value(ID.Inst->getType()), @@ -284,10 +294,14 @@ /// The next available integer to assign to a legal Instruction to. unsigned LegalInstrNumber = 0; + unsigned BBNumber = 0; + /// Correspondence from IRInstructionData to unsigned integers. DenseMap InstructionIntegerMap; + DenseMap BasicBlockToInteger; + /// Set if we added an illegal number in the previous step. /// Since each illegal number is unique, we only need one of them between /// each range of legal numbers. 
This lets us make sure we don't add more @@ -329,6 +343,16 @@ IRInstructionDataList *IDL = nullptr; + void initializeForBBs(Function &F) { + for (BasicBlock &BB : F) + BasicBlockToInteger.insert(std::make_pair(&BB, BBNumber++)); + } + + void initializeForBBs(Module &M) { + for (Function &F : M) + initializeForBBs(F); + } + /// Maps the Instructions in a BasicBlock \p BB to legal or illegal integers /// determined by \p InstrType. Two Instructions are mapped to the same value /// if they are close as defined by the InstructionData class above. @@ -386,7 +410,11 @@ InstructionClassification() {} // TODO: Determine a scheme to resolve when the label is similar enough. - InstrType visitBranchInst(BranchInst &BI) { return Illegal; } + InstrType visitBranchInst(BranchInst &BI) { + if (EnableBranches) + return Legal; + return Illegal; + } // TODO: Determine a scheme to resolve when the labels are similar enough. InstrType visitPHINode(PHINode &PN) { return Illegal; } // TODO: Handle allocas. @@ -419,6 +447,8 @@ // TODO: Handle interblock similarity. InstrType visitTerminator(Instruction &I) { return Illegal; } InstrType visitInstruction(Instruction &I) { return Legal; } + + bool EnableBranches = false; }; /// Maps an Instruction to a member of InstrType. @@ -496,6 +526,8 @@ DenseMap CanonNumToNumber; /// @} + DenseSet BasicBlocks; + public: /// \param StartIdx - The starting location of the region. /// \param Len - The length of the region. @@ -631,6 +663,9 @@ /// \returns The BasicBlock the IRSimilarityCandidate ends in. BasicBlock *getEndBB() { return LastInst->Inst->getParent(); } + /// \returns The BasicBlocks contained in the IRSimilarityCandidate. + DenseSet &getBasicBlocks() { return BasicBlocks; } + /// \returns The Function that the IRSimilarityCandidate is located in. 
Function *getFunction() { return getStartBB()->getParent(); } Index: llvm/include/llvm/Transforms/IPO/IROutliner.h =================================================================== --- llvm/include/llvm/Transforms/IPO/IROutliner.h +++ llvm/include/llvm/Transforms/IPO/IROutliner.h @@ -90,6 +90,14 @@ /// call. bool ChangedArgOrder = false; + /// Marks whether this region ends in a branch, there is special handling + /// required for the following basic blocks in this case. + bool EndsInBranch = false; + + /// The PHIBlocks with their corresponding return block based on the return + /// value as the key. + DenseMap PHIBlocks; + /// Mapping of the argument number in the deduplicated function /// to a given constant, which is used when creating the arguments to the call /// to the newly created deduplicated function. This is handled separately @@ -179,6 +187,10 @@ /// for extraction. bool IgnoreGroup = false; + /// Keeping track of how many different branches outside the region the + /// regions in the group perform. + unsigned BranchesToOutside = 0; + /// The return blocks for the overall function. DenseMap EndBBs; @@ -391,7 +403,7 @@ InstructionAllowed() {} // TODO: Determine a scheme to resolve when the label is similar enough. - bool visitBranchInst(BranchInst &BI) { return false; } + bool visitBranchInst(BranchInst &BI) { return true; } // TODO: Determine a scheme to resolve when the labels are similar enough. bool visitPHINode(PHINode &PN) { return false; } // TODO: Handle allocas. @@ -430,6 +442,8 @@ // TODO: Handle interblock similarity. bool visitTerminator(Instruction &I) { return false; } bool visitInstruction(Instruction &I) { return true; } + + bool EnableBranches = false; }; /// A InstVisitor used to exclude certain instructions from being outlined. 
Index: llvm/lib/Analysis/IRSimilarityIdentifier.cpp =================================================================== --- llvm/lib/Analysis/IRSimilarityIdentifier.cpp +++ llvm/lib/Analysis/IRSimilarityIdentifier.cpp @@ -23,6 +23,10 @@ using namespace llvm; using namespace IRSimilarity; +cl::opt EnableBranches( + "ir-sim-branches", cl::init(false), cl::ReallyHidden, + cl::desc("enable similarity mactching across branches.")); + IRInstructionData::IRInstructionData(Instruction &I, bool Legality, IRInstructionDataList &IDList) : Inst(&I), Legal(Legality), IDL(&IDList) { @@ -49,6 +53,30 @@ } } +void IRInstructionData::setBranchSuccessors( + DenseMap &BasicBlockToInteger) { + assert(isa(Inst) && "Instruction must be branch"); + + BranchInst *BI = cast(Inst); + DenseMap::iterator BBNumIt; + + BBNumIt = BasicBlockToInteger.find(BI->getParent()); + assert(BBNumIt != BasicBlockToInteger.end() && + "Could not find location for BasicBlock!"); + + int CurrentBlockNumber = static_cast(BBNumIt->second); + + for (BasicBlock *Successor : BI->successors()) { + BBNumIt = BasicBlockToInteger.find(Successor); + assert(BBNumIt != BasicBlockToInteger.end() && + "Could not find number for BasicBlock!"); + int OtherBlockNumber = static_cast(BBNumIt->second); + + int Relative = OtherBlockNumber - CurrentBlockNumber; + RelativeBlockLocations.push_back(Relative); + } +} + CmpInst::Predicate IRInstructionData::predicateForConsistency(CmpInst *CI) { switch (CI->getPredicate()) { case CmpInst::FCMP_OGT: @@ -143,6 +171,11 @@ return false; } + if (isa(A.Inst) && isa(B.Inst)) { + if (A.RelativeBlockLocations.size() != B.RelativeBlockLocations.size()) + return false; + } + return true; } @@ -156,10 +189,6 @@ std::vector IntegerMappingForBB; std::vector InstrListForBB; - HaveLegalRange = false; - CanCombineWithPrevInstr = false; - AddedIllegalLastTime = true; - for (BasicBlock::iterator Et = BB.end(); It != Et; ++It) { switch (InstClassifier.visit(*It)) { case InstrType::Legal: @@ -175,7 +204,8 @@ 
} if (HaveLegalRange) { - mapToIllegalUnsigned(It, IntegerMappingForBB, InstrListForBB, true); + if (AddedIllegalLastTime) + mapToIllegalUnsigned(It, IntegerMappingForBB, InstrListForBB, true); for (IRInstructionData *ID : InstrListForBB) this->IDL->push_back(*ID); llvm::append_range(InstrList, InstrListForBB); @@ -203,6 +233,9 @@ IRInstructionData *ID = allocateIRInstructionData(*It, true, *IDL); InstrListForBB.push_back(ID); + if (isa(*It)) + ID->setBranchSuccessors(BasicBlockToInteger); + // Add to the instruction list bool WasInserted; DenseMap::iterator @@ -322,6 +355,9 @@ NumberToValue.try_emplace(LocalValNumber, ID->Inst); LocalValNumber++; } + + BasicBlock *BB = ID->Inst->getParent(); + BasicBlocks.insert(BB); } // Setting the first and last instruction data pointers for the candidate. If @@ -640,6 +676,43 @@ {A, OperValsA, ValueNumberMappingA}, {B, OperValsB, ValueNumberMappingB})) return false; + + // Here we check that between two corresponding branch instructions, + // when the branch targets a basic block within the same region, the + // relative locations are the same, and otherwise, are targeting basic + // blocks outside the region at the same locations. 
+ if (isa(ItA->Inst) && isa(ItB->Inst)) { + if (ItA->RelativeBlockLocations.size() != + ItB->RelativeBlockLocations.size()) + return false; + + if (OperValsA.size() != OperValsB.size()) + return false; + + detail::zippy &, + SmallVector &, ArrayRef &, + ArrayRef &> + ZippedRelativeLocations = + zip(ItA->RelativeBlockLocations, ItB->RelativeBlockLocations, + OperValsA, OperValsB); + if (!all_of(ZippedRelativeLocations, + [&A, &B](std::tuple R) { + BasicBlock *ABB = static_cast(std::get<2>(R)); + BasicBlock *BBB = static_cast(std::get<3>(R)); + bool AContained = false; + bool BContained = false; + if (A.BasicBlocks.find(ABB) != A.BasicBlocks.end()) + AContained = true; + if (B.BasicBlocks.find(BBB) != B.BasicBlocks.end()) + BContained = true; + if (AContained != BContained) + return false; + if (AContained) + return std::get<0>(R) == std::get<1>(R); + return true; + })) + return false; + } } return true; } @@ -665,6 +738,8 @@ std::vector IntegerMappingForModule; // Iterate over the functions in the module to map each Instruction in each // BasicBlock to an unsigned integer. + Mapper.initializeForBBs(M); + for (Function &F : M) { if (F.empty()) @@ -672,15 +747,16 @@ for (BasicBlock &BB : F) { - if (BB.sizeWithoutDebug() < 2) - continue; - // BB has potential to have similarity since it has a size greater than 2 // and can therefore match other regions greater than 2. Map it to a list // of unsigned integers. 
Mapper.convertToUnsignedVec(BB, InstrListForModule, IntegerMappingForModule); } + + BasicBlock::iterator It = F.begin()->end(); + Mapper.mapToIllegalUnsigned(It, IntegerMappingForModule, InstrListForModule, + true); } // Insert the InstrListForModule at the end of the overall InstrList so that @@ -914,6 +990,7 @@ void IRSimilarityIdentifier::findCandidates( std::vector &InstrList, std::vector &IntegerMapping) { + SuffixTree ST(IntegerMapping); std::vector CandsForRepSubstring; @@ -932,12 +1009,15 @@ continue; findCandidateStructures(CandsForRepSubstring, StructuralGroups); - for (std::pair &Group : StructuralGroups) + for (std::pair &Group : StructuralGroups) { // We only add the group if it contains more than one // IRSimilarityCandidate. If there is only one, that means there is no // other repeated subsequence with the same structure. - if (Group.second.size() > 1) + + if (Group.second.size() > 1) { SimilarityCandidates->push_back(Group.second); + } + } CandsForRepSubstring.clear(); StructuralGroups.clear(); @@ -951,6 +1031,7 @@ std::vector InstrList; std::vector IntegerMapping; + Mapper.InstClassifier.EnableBranches = EnableBranches; populateMapper(Modules, InstrList, IntegerMapping); findCandidates(InstrList, IntegerMapping); @@ -960,6 +1041,7 @@ SimilarityGroupList &IRSimilarityIdentifier::findSimilarity(Module &M) { resetSimilarityCandidates(); + Mapper.InstClassifier.EnableBranches = EnableBranches; std::vector InstrList; std::vector IntegerMapping; Index: llvm/lib/Transforms/IPO/IROutliner.cpp =================================================================== --- llvm/lib/Transforms/IPO/IROutliner.cpp +++ llvm/lib/Transforms/IPO/IROutliner.cpp @@ -34,6 +34,8 @@ using namespace llvm; using namespace IRSimilarity; +extern cl::opt EnableBranches; + // Set to true if the user wants the ir outliner to run on linkonceodr linkage // functions. This is false by default because the linker can dedupe linkonceodr // functions. 
Since the outliner is confined to a single module (modulo LTO), @@ -67,13 +69,14 @@ void OutlinableRegion::splitCandidate() { assert(!CandidateSplit && "Candidate already split!"); + Instruction *EndInst = Candidate->end()->Inst; + Instruction *BackInst = Candidate->backInstruction(); - if (Candidate->end()->Inst != - Candidate->backInstruction()->getNextNonDebugInstruction()) - return; + if (!BackInst->isTerminator()) + if (EndInst != BackInst->getNextNonDebugInstruction()) + return; Instruction *StartInst = (*Candidate->begin()).Inst; - Instruction *EndInst = (*Candidate->end()).Inst; assert(StartInst && EndInst && "Expected a start and end instruction?"); StartBB = StartInst->getParent(); PrevBB = StartBB; @@ -97,10 +100,14 @@ StartBB = PrevBB->splitBasicBlock(StartInst, OriginalName + "_to_outline"); - // This is the case for the inner block since we do not have to include - // multiple blocks. - EndBB = StartBB; - FollowBB = EndBB->splitBasicBlock(EndInst, OriginalName + "_after_outline"); + if (!BackInst->isTerminator()) { + EndBB = EndInst->getParent(); + FollowBB = EndBB->splitBasicBlock(EndInst, OriginalName + "_after_outline"); + } else { + EndBB = BackInst->getParent(); + EndsInBranch = true; + FollowBB = nullptr; + } CandidateSplit = true; } @@ -123,7 +130,6 @@ // inst3 // inst4 assert(StartBB != nullptr && "StartBB for Candidate is not defined!"); - assert(FollowBB != nullptr && "StartBB for Candidate is not defined!"); // StartBB should only have one predecessor since we put an unconditional // branch at the end of PrevBB when we split the BasicBlock. 
@@ -132,21 +138,27 @@ "No Predecessor for the region start basic block!"); assert(PrevBB->getTerminator() && "Terminator removed from PrevBB!"); - assert(EndBB->getTerminator() && "Terminator removed from EndBB!"); PrevBB->getTerminator()->eraseFromParent(); - EndBB->getTerminator()->eraseFromParent(); + if (!EndsInBranch) { + assert(FollowBB != nullptr && "FollowBB for Candidate is not defined!"); + assert(EndBB->getTerminator() && "Terminator removed from EndBB!"); + EndBB->getTerminator()->eraseFromParent(); + } moveBBContents(*StartBB, *PrevBB); BasicBlock *PlacementBB = PrevBB; if (StartBB != EndBB) PlacementBB = EndBB; - moveBBContents(*FollowBB, *PlacementBB); + if (!EndsInBranch) + moveBBContents(*FollowBB, *PlacementBB); PrevBB->replaceSuccessorsPhiUsesWith(StartBB, PrevBB); - PrevBB->replaceSuccessorsPhiUsesWith(FollowBB, PlacementBB); + if (!EndsInBranch) { + PrevBB->replaceSuccessorsPhiUsesWith(FollowBB, PlacementBB); + FollowBB->eraseFromParent(); + } StartBB->eraseFromParent(); - FollowBB->eraseFromParent(); // Make sure to save changes back to the StartBB. StartBB = PrevBB; @@ -204,19 +216,27 @@ // division instruction for targets that have a native division instruction. // To be overly conservative, we only add 1 to the number of instructions for // each division instruction. 
- for (Instruction &I : *StartBB) { - switch (I.getOpcode()) { - case Instruction::FDiv: - case Instruction::FRem: - case Instruction::SDiv: - case Instruction::SRem: - case Instruction::UDiv: - case Instruction::URem: - Benefit += 1; - break; - default: - Benefit += TTI.getInstructionCost(&I, TargetTransformInfo::TCK_CodeSize); - break; + DenseSet CandidateBlocks; + for (IRInstructionData &ID : *Candidate) { + BasicBlock *BB = ID.Inst->getParent(); + CandidateBlocks.insert(BB); + } + + for (BasicBlock *BB : CandidateBlocks) { + for (Instruction &I : *BB) { + switch (I.getOpcode()) { + case Instruction::FDiv: + case Instruction::FRem: + case Instruction::SDiv: + case Instruction::SRem: + case Instruction::UDiv: + case Instruction::URem: + Benefit += 1; + break; + default: + Benefit += TTI.getInstructionCost(&I, TargetTransformInfo::TCK_CodeSize); + break; + } } } @@ -316,8 +336,25 @@ unsigned FunctionNameSuffix) { assert(!Group.OutlinedFunction && "Function is already defined!"); + Type *RetTy = Type::getVoidTy(M.getContext()); + // All extracted functions _should_ have the same return type at this point. + // since the similarity identifier ensures that all branches outside of the + // region occur in the same place. + + // NOTE: Should we ever move to the model that uses a switch at every point + // needed, meaning that we could branch within the region or out, it is + // possible that we will need to switch to using the most general case all of + // the time. 
+ for (OutlinableRegion *R : Group.Regions) { + Type *ExtractedFuncType = R->ExtractedFunction->getReturnType(); + if (RetTy->isVoidTy() && !ExtractedFuncType->isVoidTy()) + RetTy = ExtractedFuncType; + else if (RetTy->isIntegerTy(1) && RetTy->isIntegerTy(16)) + RetTy = ExtractedFuncType; + } + Group.OutlinedFunctionType = FunctionType::get( - Type::getVoidTy(M.getContext()), Group.ArgumentTypes, false); + RetTy, Group.ArgumentTypes, false); // These functions will only be called from within the same module, so // we can set an internal linkage. @@ -378,6 +415,7 @@ static void moveFunctionData(Function &Old, Function &New, DenseMap &NewEnds) { Function::iterator CurrBB, NextBB, FinalBB; + // BasicBlock * NewEnds = nullptr; std::vector DebugInsts; for (CurrBB = Old.begin(), FinalBB = Old.end(); CurrBB != FinalBB; CurrBB = NextBB) { @@ -691,10 +729,51 @@ /// \param [in] Outputs - The values found by the code extractor. static void findExtractedOutputToOverallOutputMapping(OutlinableRegion &Region, - ArrayRef Outputs) { + SetVector &Outputs) { OutlinableGroup &Group = *Region.Parent; IRSimilarityCandidate &C = *Region.Candidate; + std::vector BE; + DenseSet BBSet; + for (IRInstructionData &ID : C) { + BasicBlock *BB = ID.Inst->getParent(); + if (BBSet.contains(BB)) + continue; + BE.push_back(BB); + BBSet.insert(BB); + } + + SmallPtrSet Exits; + for (BasicBlock *Block : BE) { + for (BasicBlock *Succ : successors(Block)) { + if (!BBSet.contains(Succ)) { + Exits.insert(Succ); + } + } + } + + // After determining which blocks exit to PHINodes, we add these PHINodes to + // the set of outputs to be processed. + DenseSet PHIWrapped; + for (BasicBlock *ExitBB : Exits) { + for (PHINode &PN : ExitBB->phis()) { + // Find all incoming values from the outlining region. 
+ SmallVector IncomingVals; + for (unsigned i = 0; i < PN.getNumIncomingValues(); ++i) + if (BBSet.contains(PN.getIncomingBlock(i))) + IncomingVals.push_back(i); + + // Do not process PHI if there is one (or fewer) predecessor from region. + if (IncomingVals.size() <= 1) + continue; + + Outputs.insert(&PN); + + for (unsigned i : IncomingVals) + PHIWrapped.insert(PN.getIncomingValue(i)); + } + } + // This counts the argument number in the extracted function. unsigned OriginalIndex = Region.NumExtractedInputs; @@ -709,6 +788,7 @@ // type to the overall argument type list. We also store the GVNs used for // stores to identify which values will need to be moved into an special // block that holds the stores to the output registers. + SmallVector GlobalValues; for (Value *Output : Outputs) { TypeFound = false; // We can do this since it is a result value, and will have a number @@ -716,9 +796,35 @@ // do not have to be in same order, but are functionally the same, we will // have to use a different scheme, as one-to-one correspondence is not // guaranteed. - unsigned GlobalValue = C.getGVN(Output).getValue(); + GlobalValues.clear(); + + // Since values outside the region can be combined into PHINode when we + // have multiple exists, we check if we have a PHINode in the output, to + // mark that both of these outputs are getting caught. + + // If we cannot find a GVN, this means that the input to the PHINode is not + // included in the region we are trying to analyze, meaning, that if it was + // outlined, we would be adding an extra input. We ignore this case for + // now, and so ignore the region. + + // TODO: Adapt to the extra input from the PHINode. 
+ if (PHINode *PN = dyn_cast(Output)) { + for (unsigned i = 0; i < PN->getNumIncomingValues(); ++i) { + Value *V = PN->getIncomingValue(i); + if (!C.getGVN(V).hasValue()) { + Region.IgnoreRegion = true; + return; + } + GlobalValues.push_back(C.getGVN(V).getValue()); + } + } else + GlobalValues.push_back(C.getGVN(Output).getValue()); unsigned ArgumentSize = Group.ArgumentTypes.size(); + // If the output is combined in a PHINode, we make sure to skip over it. + if (PHIWrapped.contains(Output)) + continue; + for (unsigned Jdx = TypeIndex; Jdx < ArgumentSize; Jdx++) { if (Group.ArgumentTypes[Jdx] != PointerType::getUnqual(Output->getType())) continue; @@ -730,7 +836,8 @@ AggArgsUsed.insert(Jdx); Region.ExtractedArgToAgg.insert(std::make_pair(OriginalIndex, Jdx)); Region.AggArgToExtracted.insert(std::make_pair(Jdx, OriginalIndex)); - Region.GVNStores.push_back(GlobalValue); + for (unsigned GlobalValue : GlobalValues) + Region.GVNStores.push_back(GlobalValue); break; } @@ -744,7 +851,8 @@ std::make_pair(OriginalIndex, Group.ArgumentTypes.size() - 1)); Region.AggArgToExtracted.insert( std::make_pair(Group.ArgumentTypes.size() - 1, OriginalIndex)); - Region.GVNStores.push_back(GlobalValue); + for (unsigned GlobalValue : GlobalValues) + Region.GVNStores.push_back(GlobalValue); } stable_sort(Region.GVNStores); @@ -770,7 +878,7 @@ // Map the outputs found by the CodeExtractor to the arguments found for // the overall function. - findExtractedOutputToOverallOutputMapping(Region, Outputs.getArrayRef()); + findExtractedOutputToOverallOutputMapping(Region, Outputs); } /// Replace the extracted function in the Region with a call to the overall @@ -869,6 +977,9 @@ // Transfer any debug information. Call->setDebugLoc(Region.Call->getDebugLoc()); + // Since our output may determine which branch we go to, we make sure to + // propagate this new call value through the module. + OldCall->replaceAllUsesWith(Call); // Remove the old instruction. 
OldCall->eraseFromParent(); @@ -945,16 +1056,96 @@ continue; ReturnInst *RI = cast(Term); Value *RetVal = RI->getReturnValue(); - DenseMap::iterator VBBIt = OutputBBs.find(RetVal); + DenseMap::iterator VBBIt, PHIVBBIt; + VBBIt = OutputBBs.find(RetVal); assert(VBBIt != OutputBBs.end() && "Could not find output value!"); Instruction *NewI = I->clone(); NewI->setDebugLoc(DebugLoc()); BasicBlock *OutputBB = VBBIt->second; OutputBB->getInstList().push_back(NewI); - LLVM_DEBUG(dbgs() << "Move store for instruction " << *I << " to " - << *OutputBB << "\n"); + LLVM_DEBUG(dbgs() << "Move store for instruction " << *I << " to " + << *OutputBB << "\n"); + + // If this is storing a PHINode, we must make sure it is included in the + // overall function. + StoreInst *SI = static_cast(NewI); + if (!isa(SI->getValueOperand())) + continue; + PHINode *PN = cast(SI->getValueOperand()); + // If it has a value, it was not split by the code extractor, which + // is what we are looking for. + if (Region.Candidate->getGVN(PN).hasValue()) + continue; + + // Find if a PHIBlock exists for this return value already. If it is + // the first time we are anaylzing this, we will not, so we record it. + PHIVBBIt = Group.PHIBlocks.find(RetVal); + bool Inserted = false; + if (FirstFunction) { + BasicBlock *PHIBlock = PN->getParent(); + std::tie(PHIVBBIt, Inserted) = + Group.PHIBlocks.insert(std::make_pair(RetVal, PHIBlock)); + } + + Region.PHIBlocks.insert(std::make_pair(RetVal, PN->getParent())); + + if (PHIVBBIt != Group.PHIBlocks.end()) + continue; + + // If we did not find a block, we create one, and insert it into the + // overall function and record it. + BasicBlock *PHIBlock = BasicBlock::Create(I->getContext(), "phi_block", + OutputBB->getParent()); + std::tie(PHIVBBIt, Inserted) = + Group.PHIBlocks.insert(std::make_pair(RetVal, PHIBlock)); + BranchInst::Create(OutputBB, PHIBlock); + // We find the predecessors in the overall function, and make sure + // we now branch to this new block. 
+ for (BasicBlock *Pred : predecessors(PHIBlock)) { + Instruction *Term = Pred->getTerminator(); + BranchInst *BI = static_cast(Term); + BI->setSuccessor(0, PHIBlock); + } + BasicBlock *OverallPhiBlock = PHIVBBIt->second; + // If this is the first function, we do not need to worry about mergiing + // this with any other block in the overall outlined function + if (FirstFunction) + continue; + + // For our PHINode, we find the combined canonical numbering, and + // attempt to find a matching PHINode in the overall PHIBlock. If we + // cannot, we copy the PHINode and move it into this new block. + DenseSet PNGVNs; + for (unsigned Idx = 0; Idx < PN->getNumIncomingValues(); Idx++) { + Value *IVal = PN->getIncomingValue(Idx); + Optional GVN = Region.Candidate->getGVN(IVal); + assert(GVN.hasValue() && "No GVN for incoming value"); + Optional CanonNum = Region.Candidate->getCanonicalNum(*GVN); + PNGVNs.insert(*CanonNum); + } + for (Instruction &Inst : *OverallPhiBlock) { + if (!isa(Inst)) + continue; + bool Found = true; + PHINode *CurrPN = static_cast(&Inst); + for (unsigned Idx = 0; Idx < CurrPN->getNumIncomingValues(); Idx++) { + Value *IVal = CurrPN->getIncomingValue(Idx); + OutlinableRegion *First = Group.Regions[0]; + Optional GVN = First->Candidate->getGVN(IVal); + assert(GVN.hasValue() && "No GVN for incoming value"); + Optional CanonNum = First->Candidate->getCanonicalNum(*GVN); + if (!PNGVNs.contains(*CanonNum)) { + Found = false; + break; + } + } + if (!Found) { + PHINode *NewPN = static_cast(PN->clone()); + NewPN->insertBefore(&*OverallPhiBlock->begin()); + } + } } // If we added an edge for basic blocks without a predecessor, we remove it @@ -1058,24 +1249,23 @@ BasicBlock *CompBB = VToB.second; BasicBlock *OutputBB = OutputBBIt->second; - if (CompBB->size() - 1 != OutputBB->size()) { - WrongSize = true; + if (CompBB->size() - 1 != OutputBB->size()) { + WrongSize = true; break; - } + } - WrongSize = false; - BasicBlock::iterator NIt = OutputBB->begin(); - for 
(Instruction &I : *CompBB) { - if (isa(&I)) - continue; + BasicBlock::iterator NIt = OutputBB->begin(); + for (Instruction &I : *CompBB) { + if (isa(&I)) + continue; - if (!I.isIdenticalTo(&(*NIt))) { - WrongInst = true; - break; - } + if (!I.isIdenticalTo(&(*NIt))) { + WrongInst = true; + break; + } - NIt++; - } + NIt++; + } } if (!WrongInst && !WrongSize) @@ -1103,10 +1293,11 @@ OutlinableGroup &OG, OutlinableRegion &Region, DenseMap OutputBBs, DenseMap EndBBs, - const DenseMap &OutputMappings, + const DenseMap &OutputMappings, std::vector> &OutputStoreBBs) { DenseSet ValuesToFind(Region.GVNStores.begin(), Region.GVNStores.end()); + //errs() << "Num values to find " << ValuesToFind.size() << "\n"; // We iterate over the instructions in the extracted function, and find the // global value number of the instructions. If we find a value that should @@ -1119,6 +1310,13 @@ ExcludeBBs.insert(VBPair.second); for (std::pair &VBPair : OutputBBs) ExcludeBBs.insert(VBPair.second); + // Add PHIBlocks to exclusionary zones since it is possible they may not be + // same across the extracted functions. + for (std::pair &VBPair : Region.PHIBlocks) + ExcludeBBs.insert(VBPair.second); + for (std::pair &VBPair : Region.Parent->PHIBlocks) + ExcludeBBs.insert(VBPair.second); + std::vector ExtractedFunctionInsts = collectRelevantInstructions(*(Region.ExtractedFunction), ExcludeBBs); std::vector OverallFunctionInsts = @@ -1138,6 +1336,30 @@ // If we have found one of the stored values for output, replace the value // with the corresponding one from the overall function. if (GVN.hasValue() && ValuesToFind.erase(GVN.getValue())) { + Value *NewV = OverallFunctionInsts[Idx]; + PHINode *OverallPN = nullptr; + // If there is a PHINode following the instruction, it means it is not + // really an output, but the following PHINode is, we make sure to update + // the uses of the PHINode as well to ensure that our output blocks in the + // overall function are consistent. 
+ for (User *U : NewV->users()) { + if (PHINode *PN = dyn_cast(U)) { + OverallPN = PN; + } + } + + for (User *U : V->users()) { + if (PHINode *PN = dyn_cast(U)) { + if (!OverallPN) { + BasicBlock *Old = static_cast(V)->getParent(); + BasicBlock *New = static_cast(NewV)->getParent(); + PN->replaceIncomingBlockWith(Old, New); + continue; + } + PN->replaceAllUsesWith(OverallPN); + } + } + V->replaceAllUsesWith(OverallFunctionInsts[Idx]); if (ValuesToFind.size() == 0) break; @@ -1201,8 +1423,8 @@ NewBB = VtoBB.second; DenseMap::iterator VBBIt = EndBBs.find(RetValueForBB); - LLVM_DEBUG(dbgs() << "Create output block for region in" - << Region.ExtractedFunction << " to " + LLVM_DEBUG(dbgs() << "Create output block for region in" + << Region.ExtractedFunction << " to " << *NewBB); BranchInst::Create(VBBIt->second, NewBB); OutputStoreBBs.back().insert(std::make_pair(RetValueForBB, NewBB)); @@ -1233,17 +1455,17 @@ M.getContext(), "final_block_" + Twine(static_cast(FinalBlockIdx++)), AggFunc); - Instruction *Term = EndBB->getTerminator(); - Term->moveBefore(*ReturnBlock, ReturnBlock->end()); + Instruction *Term = EndBB->getTerminator(); + Term->moveBefore(*ReturnBlock, ReturnBlock->end()); // Put the switch statement in the old end basic block for the function // with a fall through to the new return block. 
- LLVM_DEBUG(dbgs() << "Create switch statement in " << *AggFunc << " for " - << OutputStoreBBs.size() << "\n"); - SwitchInst *SwitchI = - SwitchInst::Create(AggFunc->getArg(AggFunc->arg_size() - 1), - ReturnBlock, OutputStoreBBs.size(), EndBB); + LLVM_DEBUG(dbgs() << "Create switch statement in " << *AggFunc << " for " + << OutputStoreBBs.size() << "\n"); + SwitchInst *SwitchI = + SwitchInst::Create(AggFunc->getArg(AggFunc->arg_size() - 1), + ReturnBlock, OutputStoreBBs.size(), EndBB); - unsigned Idx = 0; + unsigned Idx = 0; for (DenseMap OutputStoreBB : OutputStoreBBs) { DenseMap::iterator OSBBIt = OutputStoreBB.find(OutputBlock.first); @@ -1253,9 +1475,9 @@ BasicBlock *BB = OSBBIt->second; SwitchI->addCase( ConstantInt::get(Type::getInt32Ty(M.getContext()), Idx), BB); - Term = BB->getTerminator(); - Term->setSuccessor(0, ReturnBlock); - Idx++; + Term = BB->getTerminator(); + Term->setSuccessor(0, ReturnBlock); + Idx++; } } return; @@ -1273,10 +1495,10 @@ BasicBlock *EndBB = EndBBIt->second; BasicBlock *OutputBB = VBPair.second; Instruction *Term = OutputBB->getTerminator(); - Term->eraseFromParent(); - Term = EndBB->getTerminator(); + Term->eraseFromParent(); + Term = EndBB->getTerminator(); moveBBContents(*OutputBB, *EndBB); - Term->moveBefore(*EndBB, EndBB->end()); + Term->moveBefore(*EndBB, EndBB->end()); OutputBB->eraseFromParent(); } } @@ -1316,11 +1538,11 @@ DenseMap NewBBs; unsigned Idx = 0; for (std::pair &VtoBB : CurrentGroup.EndBBs) { - BasicBlock *NewBB = BasicBlock::Create( + BasicBlock *NewBB = BasicBlock::Create( M.getContext(), Twine("output_block_") + Twine(static_cast(0)) + Twine("_") + Twine(static_cast(Idx++)), - CurrentGroup.OutlinedFunction); + CurrentGroup.OutlinedFunction); NewBBs.insert(std::make_pair(VtoBB.first, NewBB)); } CurrentOS->OutputBlockNum = 0; @@ -1340,13 +1562,12 @@ RetValueForBB = VtoBB.first; NewBB = VtoBB.second; - if (NewBB->size() == 0) { - CurrentOS->OutputBlockNum = -1; - NewBB->eraseFromParent(); + if (NewBB->size() 
== 0) { + NewBB->eraseFromParent(); ToRemove.push_back(RetValueForBB); continue; - } - + } + AllRemoved = false; DenseMap::iterator VBBIt = CurrentGroup.EndBBs.find(RetValueForBB); @@ -1387,11 +1608,11 @@ DenseMap NewBBs; unsigned BBIdx = 0; for (std::pair &VtoBB : CurrentGroup.EndBBs) { - BasicBlock *NewBB = BasicBlock::Create( + BasicBlock *NewBB = BasicBlock::Create( M.getContext(), Twine("output_block_") + Twine(static_cast(Idx)) + Twine("_") + Twine(static_cast(BBIdx++)), - CurrentGroup.OutlinedFunction); + CurrentGroup.OutlinedFunction); NewBBs.insert(std::make_pair(VtoBB.first, NewBB)); } replaceArgumentUses(*CurrentOS, NewBBs); @@ -1419,19 +1640,21 @@ return false; } - if (Region.Candidate->end()->Inst != - Region.Candidate->backInstruction()->getNextNonDebugInstruction()) { - IRInstructionDataList *IDL = Region.Candidate->front()->IDL; - Instruction *NewEndInst = - Region.Candidate->backInstruction()->getNextNonDebugInstruction(); - IRInstructionData *NewEndIRID = new (InstDataAllocator.Allocate()) - IRInstructionData(*NewEndInst, - InstructionClassifier.visit(*NewEndInst), *IDL); - - // Insert the first IRInstructionData of the new region after the - // last IRInstructionData of the IRSimilarityCandidate. - IDL->insert(Region.Candidate->end(), *NewEndIRID); - } + if (!Region.Candidate->backInstruction()->isTerminator()) { + if (Region.Candidate->end()->Inst != + Region.Candidate->backInstruction()->getNextNonDebugInstruction()) { + IRInstructionDataList *IDL = Region.Candidate->front()->IDL; + Instruction *NewEndInst = + Region.Candidate->backInstruction()->getNextNonDebugInstruction(); + IRInstructionData *NewEndIRID = new (InstDataAllocator.Allocate()) + IRInstructionData(*NewEndInst, + InstructionClassifier.visit(*NewEndInst), *IDL); + + // Insert the first IRInstructionData of the new region after the + // last IRInstructionData of the IRSimilarityCandidate. 
+ IDL->insert(Region.Candidate->end(), *NewEndIRID); + } + } bool BadInst = any_of(*IRSC, [this](IRInstructionData &ID) { // We check if there is a discrepancy between the InstructionDataList @@ -1441,9 +1664,10 @@ // Since we do not have any similarity data about this particular // instruction, we cannot confidently outline it, and must discard this // candidate. - if (std::next(ID.getIterator())->Inst != - ID.Inst->getNextNonDebugInstruction()) - return true; + if (!ID.Inst->isTerminator()) + if (std::next(ID.getIterator())->Inst != + ID.Inst->getNextNonDebugInstruction()) + return true; return !this->InstructionClassifier.visit(ID.Inst); }); @@ -1476,9 +1700,14 @@ if (PreviouslyOutlined) continue; - // TODO: If in the future we can outline across BasicBlocks, we will need to - // check all BasicBlocks contained in the region. - if (IRSC.getStartBB()->hasAddressTaken()) + bool BBHasAddressTaken = false; + for (IRInstructionData &ID : IRSC) { + if (ID.Inst->getParent()->hasAddressTaken()) { + BBHasAddressTaken = true; + break; + } + } + if (BBHasAddressTaken) continue; if (IRSC.front()->Inst->getFunction()->hasLinkOnceODRLinkage() && @@ -1498,9 +1727,10 @@ // Since we do not have any similarity data about this particular // instruction, we cannot confidently outline it, and must discard this // candidate. 
-      if (std::next(ID.getIterator())->Inst !=
-          ID.Inst->getNextNonDebugInstruction())
-        return true;
+      if (!ID.Inst->isTerminator())
+        if (std::next(ID.getIterator())->Inst !=
+            ID.Inst->getNextNonDebugInstruction())
+          return true;
       return !this->InstructionClassifier.visit(ID.Inst);
     });
 
@@ -1567,10 +1797,35 @@
                                        OutlinableGroup &CurrentGroup,
                                        TargetTransformInfo &TTI) {
   InstructionCost OutputCost = 0;
+  unsigned NumOutputBranches = 0;
+
+  IRSimilarityCandidate &Candidate = *CurrentGroup.Regions[0]->Candidate;
+  DenseSet<BasicBlock *> CandidateBlocks;
+  for (IRInstructionData &ID : Candidate) {
+    BasicBlock *BB = ID.Inst->getParent();
+    CandidateBlocks.insert(BB);
+  }
+
+  for (IRInstructionData &ID : Candidate) {
+    if (!isa<BranchInst>(ID.Inst))
+      continue;
+
+    for (Value *V : ID.OperVals) {
+      BasicBlock *BB = static_cast<BasicBlock *>(V);
+      DenseSet<BasicBlock *>::iterator CBIt = CandidateBlocks.find(BB);
+      if (CBIt != CandidateBlocks.end())
+        continue;
+      NumOutputBranches++;
+    }
+  }
+
+  if (NumOutputBranches == 0)
+    NumOutputBranches++;
+
+  CurrentGroup.BranchesToOutside = NumOutputBranches;
 
   for (const ArrayRef<unsigned> &OutputUse :
        CurrentGroup.OutputGVNCombinations) {
-    IRSimilarityCandidate &Candidate = *CurrentGroup.Regions[0]->Candidate;
     for (unsigned GVN : OutputUse) {
       Optional<Value *> OV = Candidate.fromGVN(GVN);
       assert(OV.hasValue() && "Could not find value for GVN?");
@@ -1585,14 +1840,14 @@
       LLVM_DEBUG(dbgs() << "Adding: " << StoreCost
                         << " instructions to cost for output of type "
                         << *V->getType() << "\n");
-      OutputCost += StoreCost;
+      OutputCost += StoreCost * NumOutputBranches;
     }
 
     InstructionCost BranchCost =
         TTI.getCFInstrCost(Instruction::Br, TargetTransformInfo::TCK_CodeSize);
     LLVM_DEBUG(dbgs() << "Adding " << BranchCost << " to the current cost for"
                       << " a branch instruction\n");
-    OutputCost += BranchCost;
+    OutputCost += BranchCost * NumOutputBranches;
   }
 
   // If there is more than one output scheme, we must have a comparison and
@@ -1611,7 +1866,7 @@
     LLVM_DEBUG(dbgs() << "Adding: " << TotalCost
                       << " instructions for each 
switch case for each different" << " output path in a function\n"); - OutputCost += TotalCost; + OutputCost += TotalCost * NumOutputBranches; } return OutputCost; @@ -1702,7 +1957,6 @@ Region.CE->findInputsOutputs(ArgInputs, Outputs, SinkCands); assert(Region.StartBB && "StartBB for the OutlinableRegion is nullptr!"); - assert(Region.FollowBB && "FollowBB for the OutlinableRegion is nullptr!"); Function *OrigF = Region.StartBB->getParent(); CodeExtractorAnalysisCache CEAC(*OrigF); Region.ExtractedFunction = Region.CE->extractCodeRegion(CEAC); @@ -1716,7 +1970,9 @@ return false; } - BasicBlock *RewrittenBB = Region.FollowBB->getSinglePredecessor(); + User *InstAsUser = Region.ExtractedFunction->user_back(); + BasicBlock *RewrittenBB = cast(InstAsUser)->getParent(); + Region.PrevBB = RewrittenBB->getSinglePredecessor(); Region.StartBB = RewrittenBB; Region.EndBB = RewrittenBB; @@ -1759,6 +2015,7 @@ unsigned IROutliner::doOutline(Module &M) { // Find the possible similarity sections. + InstructionClassifier.EnableBranches = EnableBranches; IRSimilarityIdentifier &Identifier = getIRSI(M); SimilarityGroupList &SimilarityCandidates = *Identifier.getSimilarity(); @@ -1795,7 +2052,7 @@ // each section in the set. 
    NotSame.clear();
    CurrentGroup->findSameConstants(NotSame);
-
+
    if (CurrentGroup->IgnoreGroup)
      continue;
 
@@ -1810,7 +2067,15 @@
       if (!OS->CandidateSplit)
         continue;
 
-      std::vector<BasicBlock *> BE = {OS->StartBB};
+      std::vector<BasicBlock *> BE;
+      DenseSet<BasicBlock *> BBSet;
+      for (IRInstructionData &ID : *OS->Candidate) {
+        BasicBlock *BB = ID.Inst->getParent();
+        if (BBSet.find(BB) != BBSet.end())
+          continue;
+        BE.push_back(BB);
+        BBSet.insert(BB);
+      }
       OS->CE = new (ExtractorAllocator.Allocate())
           CodeExtractor(BE, nullptr, false, nullptr, nullptr, nullptr, false,
                         false, "outlined");
@@ -1867,12 +2132,11 @@
 
     ExtractorAllocator.DestroyAll();
 
-    if (NegativeCostGroups.size() > 1)
-      llvm::stable_sort(NegativeCostGroups,
-                        [](const OutlinableGroup *LHS,
-                           const OutlinableGroup *RHS) {
-                          return LHS->Benefit - LHS->Cost > RHS->Benefit - RHS->Cost;
-                        });
+    if (NegativeCostGroups.size() > 1 && CostModel)
+      llvm::stable_sort(NegativeCostGroups, [](const OutlinableGroup *LHS,
+                                               const OutlinableGroup *RHS) {
+        return LHS->Benefit - LHS->Cost > RHS->Benefit - RHS->Cost;
+      });
 
   std::vector<Function *> FuncsToRemove;
   for (OutlinableGroup *CG : NegativeCostGroups) {
@@ -1901,7 +2165,15 @@
     // Create functions out of all the sections, and mark them as outlined.
 
OutlinedRegions.clear(); for (OutlinableRegion *OS : CurrentGroup.Regions) { - std::vector BE = {OS->StartBB}; + std::vector BE; + DenseSet BBSet; + for (IRInstructionData &ID : *OS->Candidate) { + BasicBlock *BB = ID.Inst->getParent(); + if (BBSet.find(BB) != BBSet.end()) + continue; + BE.push_back(BB); + BBSet.insert(BB); + } OS->CE = new (ExtractorAllocator.Allocate()) CodeExtractor(BE, nullptr, false, nullptr, nullptr, nullptr, false, false, "outlined"); Index: llvm/test/Transforms/IROutliner/outlining-across-branch.ll =================================================================== --- /dev/null +++ llvm/test/Transforms/IROutliner/outlining-across-branch.ll @@ -0,0 +1,77 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --include-generated-funcs +; RUN: opt -S -verify -ir-sim-branches -iroutliner -ir-outlining-no-cost < %s | FileCheck %s + +; This checks that we are able to outline exactly the same branch structure +; while also outlining similar items on either side of the branch. 
+ +define void @outline_outputs1() #0 { +entry: + %a = alloca i32, align 4 + %b = alloca i32, align 4 + %output = alloca i32, align 4 + %result = alloca i32, align 4 + %output2 = alloca i32, align 4 + %result2 = alloca i32, align 4 + store i32 2, i32* %a, align 4 + store i32 3, i32* %b, align 4 + br label %next +next: + store i32 2, i32* %output, align 4 + store i32 3, i32* %result, align 4 + ret void +} + +define void @outline_outputs2() #0 { +entry: + %a = alloca i32, align 4 + %b = alloca i32, align 4 + %output = alloca i32, align 4 + %result = alloca i32, align 4 + %output2 = alloca i32, align 4 + %result2 = alloca i32, align 4 + store i32 2, i32* %a, align 4 + store i32 3, i32* %b, align 4 + br label %next +next: + store i32 2, i32* %output, align 4 + store i32 3, i32* %result, align 4 + ret void +} +; CHECK-LABEL: @outline_outputs1( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[A:%.*]] = alloca i32, align 4 +; CHECK-NEXT: [[B:%.*]] = alloca i32, align 4 +; CHECK-NEXT: [[OUTPUT:%.*]] = alloca i32, align 4 +; CHECK-NEXT: [[RESULT:%.*]] = alloca i32, align 4 +; CHECK-NEXT: [[OUTPUT2:%.*]] = alloca i32, align 4 +; CHECK-NEXT: [[RESULT2:%.*]] = alloca i32, align 4 +; CHECK-NEXT: call void @outlined_ir_func_0(i32* [[A]], i32* [[B]], i32* [[OUTPUT]], i32* [[RESULT]]) +; CHECK-NEXT: ret void +; +; +; CHECK-LABEL: @outline_outputs2( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[A:%.*]] = alloca i32, align 4 +; CHECK-NEXT: [[B:%.*]] = alloca i32, align 4 +; CHECK-NEXT: [[OUTPUT:%.*]] = alloca i32, align 4 +; CHECK-NEXT: [[RESULT:%.*]] = alloca i32, align 4 +; CHECK-NEXT: [[OUTPUT2:%.*]] = alloca i32, align 4 +; CHECK-NEXT: [[RESULT2:%.*]] = alloca i32, align 4 +; CHECK-NEXT: call void @outlined_ir_func_0(i32* [[A]], i32* [[B]], i32* [[OUTPUT]], i32* [[RESULT]]) +; CHECK-NEXT: ret void +; +; +; CHECK: define internal void @outlined_ir_func_0( +; CHECK: newFuncRoot: +; CHECK-NEXT: br label [[ENTRY_TO_OUTLINE:%.*]] +; CHECK: entry_after_outline.exitStub: +; CHECK-NEXT: ret void +; 
CHECK: entry_to_outline: +; CHECK-NEXT: store i32 2, i32* [[TMP0:%.*]], align 4 +; CHECK-NEXT: store i32 3, i32* [[TMP1:%.*]], align 4 +; CHECK-NEXT: br label [[NEXT:%.*]] +; CHECK: next: +; CHECK-NEXT: store i32 2, i32* [[TMP2:%.*]], align 4 +; CHECK-NEXT: store i32 3, i32* [[TMP3:%.*]], align 4 +; CHECK-NEXT: br label [[ENTRY_AFTER_OUTLINE_EXITSTUB:%.*]] +; Index: llvm/test/Transforms/IROutliner/outlining-basic-branches.ll =================================================================== --- /dev/null +++ llvm/test/Transforms/IROutliner/outlining-basic-branches.ll @@ -0,0 +1,52 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --include-generated-funcs +; RUN: opt -S -verify -ir-sim-branches -iroutliner -ir-outlining-no-cost < %s | FileCheck %s + +; This checks that we are able to outline exactly the same structure without +; any other items to outline. + +define void @outline_outputs1() #0 { +entry: + br label %next +next: + br label %next2 +next2: + br label %next +next3: + %a = alloca i32, align 4 + br label %next4 +next4: + br label %next3 +next5: + br label %next6 +next6: + %b = alloca i32, align 4 + ret void +} + +; CHECK-LABEL: @outline_outputs1( +; CHECK-NEXT: entry: +; CHECK-NEXT: br label [[NEXT:%.*]] +; CHECK: next: +; CHECK-NEXT: call void @outlined_ir_func_0() +; CHECK-NEXT: br label [[NEXT]] +; CHECK: next3: +; CHECK-NEXT: [[A:%.*]] = alloca i32, align 4 +; CHECK-NEXT: call void @outlined_ir_func_0() +; CHECK-NEXT: br label [[NEXT3:%.*]] +; CHECK: next5: +; CHECK-NEXT: br label [[NEXT6:%.*]] +; CHECK: next6: +; CHECK-NEXT: [[B:%.*]] = alloca i32, align 4 +; CHECK-NEXT: ret void +; +; +; CHECK: define internal void @outlined_ir_func_0( +; CHECK: newFuncRoot: +; CHECK-NEXT: br label [[NEXT_TO_OUTLINE:%.*]] +; CHECK: next.exitStub: +; CHECK-NEXT: ret void +; CHECK: next_to_outline: +; CHECK-NEXT: br label [[NEXT2:%.*]] +; CHECK: next2: +; CHECK-NEXT: br label [[NEXT_EXITSTUB:%.*]] +; Index: 
llvm/test/Transforms/IROutliner/outlining-branches-phi-nodes.ll =================================================================== --- /dev/null +++ llvm/test/Transforms/IROutliner/outlining-branches-phi-nodes.ll @@ -0,0 +1,165 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --include-generated-funcs +; RUN: opt -S -verify -ir-sim-branches -iroutliner -ir-outlining-no-cost < %s | FileCheck %s + +; Here we have multiple exits, but the different sources, same outputs are +; needed, this checks that they are compressed, and moved into the appropriate +; output blocks. + +define void @outline_outputs1() #0 { +entry: + %output = alloca i32, align 4 + %result = alloca i32, align 4 + %output2 = alloca i32, align 4 + %result2 = alloca i32, align 4 + %a = alloca i32, align 4 + %b = alloca i32, align 4 + br label %block_2 +block_1: + %a2 = alloca i32, align 4 + %b2 = alloca i32, align 4 + br label %block_2 +block_2: + %a2val = load i32, i32* %a + %b2val = load i32, i32* %b + %add2 = add i32 2, %a2val + %mul2 = mul i32 2, %b2val + br label %block_5 +block_3: + %aval = load i32, i32* %a + %bval = load i32, i32* %b + %add = add i32 2, %aval + %mul = mul i32 2, %bval + br label %block_4 +block_4: + store i32 %add, i32* %output, align 4 + store i32 %mul, i32* %result, align 4 + br label %block_6 +block_5: + store i32 %add2, i32* %output, align 4 + store i32 %mul2, i32* %result, align 4 + br label %block_6 +block_6: + %diff = phi i32 [%aval, %block_4], [%a2val, %block_5] + ret void +} + +define void @outline_outputs2() #0 { +entry: + %output = alloca i32, align 4 + %result = alloca i32, align 4 + %output2 = alloca i32, align 4 + %result2 = alloca i32, align 4 + %a = alloca i32, align 4 + %b = alloca i32, align 4 + br label %block_2 +block_1: + %a2 = alloca i32, align 4 + %b2 = alloca i32, align 4 + br label %block_2 +block_2: + %a2val = load i32, i32* %a + %b2val = load i32, i32* %b + %add2 = add i32 2, %a2val + %mul2 = mul i32 2, %b2val + 
br label %block_5 +block_3: + %aval = load i32, i32* %a + %bval = load i32, i32* %b + %add = add i32 2, %aval + %mul = mul i32 2, %bval + br label %block_4 +block_4: + store i32 %add, i32* %output, align 4 + store i32 %mul, i32* %result, align 4 + br label %block_6 +block_5: + store i32 %add2, i32* %output, align 4 + store i32 %mul2, i32* %result, align 4 + br label %block_6 +block_6: + %diff = phi i32 [%aval, %block_4], [%a2val, %block_5] + ret void +} + +; CHECK-LABEL: @outline_outputs1( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[DIFF_CE_LOC:%.*]] = alloca i32, align 4 +; CHECK-NEXT: [[OUTPUT:%.*]] = alloca i32, align 4 +; CHECK-NEXT: [[RESULT:%.*]] = alloca i32, align 4 +; CHECK-NEXT: [[OUTPUT2:%.*]] = alloca i32, align 4 +; CHECK-NEXT: [[RESULT2:%.*]] = alloca i32, align 4 +; CHECK-NEXT: [[A:%.*]] = alloca i32, align 4 +; CHECK-NEXT: [[B:%.*]] = alloca i32, align 4 +; CHECK-NEXT: br label [[BLOCK_2:%.*]] +; CHECK: block_1: +; CHECK-NEXT: [[A2:%.*]] = alloca i32, align 4 +; CHECK-NEXT: [[B2:%.*]] = alloca i32, align 4 +; CHECK-NEXT: br label [[BLOCK_2]] +; CHECK: block_2: +; CHECK-NEXT: [[LT_CAST:%.*]] = bitcast i32* [[DIFF_CE_LOC]] to i8* +; CHECK-NEXT: call void @llvm.lifetime.start.p0i8(i64 -1, i8* [[LT_CAST]]) +; CHECK-NEXT: call void @outlined_ir_func_0(i32* [[A]], i32* [[B]], i32* [[OUTPUT]], i32* [[RESULT]], i32* [[DIFF_CE_LOC]]) +; CHECK-NEXT: [[DIFF_CE_RELOAD:%.*]] = load i32, i32* [[DIFF_CE_LOC]], align 4 +; CHECK-NEXT: call void @llvm.lifetime.end.p0i8(i64 -1, i8* [[LT_CAST]]) +; CHECK-NEXT: br label [[BLOCK_6:%.*]] +; CHECK: block_6: +; CHECK-NEXT: [[DIFF:%.*]] = phi i32 [ [[DIFF_CE_RELOAD]], [[BLOCK_2]] ] +; CHECK-NEXT: ret void +; +; +; CHECK-LABEL: @outline_outputs2( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[DIFF_CE_LOC:%.*]] = alloca i32, align 4 +; CHECK-NEXT: [[OUTPUT:%.*]] = alloca i32, align 4 +; CHECK-NEXT: [[RESULT:%.*]] = alloca i32, align 4 +; CHECK-NEXT: [[OUTPUT2:%.*]] = alloca i32, align 4 +; CHECK-NEXT: [[RESULT2:%.*]] = alloca i32, align 4 
+; CHECK-NEXT: [[A:%.*]] = alloca i32, align 4 +; CHECK-NEXT: [[B:%.*]] = alloca i32, align 4 +; CHECK-NEXT: br label [[BLOCK_2:%.*]] +; CHECK: block_1: +; CHECK-NEXT: [[A2:%.*]] = alloca i32, align 4 +; CHECK-NEXT: [[B2:%.*]] = alloca i32, align 4 +; CHECK-NEXT: br label [[BLOCK_2]] +; CHECK: block_2: +; CHECK-NEXT: [[LT_CAST:%.*]] = bitcast i32* [[DIFF_CE_LOC]] to i8* +; CHECK-NEXT: call void @llvm.lifetime.start.p0i8(i64 -1, i8* [[LT_CAST]]) +; CHECK-NEXT: call void @outlined_ir_func_0(i32* [[A]], i32* [[B]], i32* [[OUTPUT]], i32* [[RESULT]], i32* [[DIFF_CE_LOC]]) +; CHECK-NEXT: [[DIFF_CE_RELOAD:%.*]] = load i32, i32* [[DIFF_CE_LOC]], align 4 +; CHECK-NEXT: call void @llvm.lifetime.end.p0i8(i64 -1, i8* [[LT_CAST]]) +; CHECK-NEXT: br label [[BLOCK_6:%.*]] +; CHECK: block_6: +; CHECK-NEXT: [[DIFF:%.*]] = phi i32 [ [[DIFF_CE_RELOAD]], [[BLOCK_2]] ] +; CHECK-NEXT: ret void +; +; +; CHECK: define internal void @outlined_ir_func_0( +; CHECK-NEXT: newFuncRoot: +; CHECK-NEXT: br label [[BLOCK_2_TO_OUTLINE:%.*]] +; CHECK: block_6.exitStub: +; CHECK-NEXT: store i32 [[DIFF_CE:%.*]], i32* [[TMP4:%.*]], align 4 +; CHECK-NEXT: ret void +; CHECK: block_2_to_outline: +; CHECK-NEXT: [[A2VAL:%.*]] = load i32, i32* [[TMP0:%.*]], align 4 +; CHECK-NEXT: [[B2VAL:%.*]] = load i32, i32* [[TMP1:%.*]], align 4 +; CHECK-NEXT: [[ADD2:%.*]] = add i32 2, [[A2VAL]] +; CHECK-NEXT: [[MUL2:%.*]] = mul i32 2, [[B2VAL]] +; CHECK-NEXT: br label [[BLOCK_5:%.*]] +; CHECK: block_3: +; CHECK-NEXT: [[AVAL:%.*]] = load i32, i32* [[TMP0]], align 4 +; CHECK-NEXT: [[BVAL:%.*]] = load i32, i32* [[TMP1]], align 4 +; CHECK-NEXT: [[ADD:%.*]] = add i32 2, [[AVAL]] +; CHECK-NEXT: [[MUL:%.*]] = mul i32 2, [[BVAL]] +; CHECK-NEXT: br label [[BLOCK_4:%.*]] +; CHECK: block_4: +; CHECK-NEXT: store i32 [[ADD]], i32* [[TMP2:%.*]], align 4 +; CHECK-NEXT: store i32 [[MUL]], i32* [[TMP3:%.*]], align 4 +; CHECK-NEXT: br label [[BLOCK_6_SPLIT:%.*]] +; CHECK: block_5: +; CHECK-NEXT: store i32 [[ADD2]], i32* [[TMP2]], align 4 
+; CHECK-NEXT: store i32 [[MUL2]], i32* [[TMP3]], align 4 +; CHECK-NEXT: br label [[BLOCK_6_SPLIT]] +; CHECK: block_6.split: +; CHECK-NEXT: [[DIFF_CE]] = phi i32 [ [[AVAL]], [[BLOCK_4]] ], [ [[A2VAL]], [[BLOCK_5]] ] +; CHECK-NEXT: br label [[BLOCK_6_EXITSTUB:%.*]] +; Index: llvm/test/Transforms/IROutliner/outlining-conditional-match.ll =================================================================== --- /dev/null +++ llvm/test/Transforms/IROutliner/outlining-conditional-match.ll @@ -0,0 +1,53 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --include-generated-funcs +; RUN: opt -S -verify -ir-sim-branches -iroutliner -ir-outlining-no-cost < %s | FileCheck %s + +; This checks that we are able to outline branches that use a conditional branch +; rather than just a implicit branch. + +define void @outline_outputs1() #0 { +entry: + %0 = icmp eq i32 2, 0 + br i1 %0, label %next, label %next2 +next: + br label %next3 +next2: + br label %next +next3: + br label %next4 +next4: + %1 = icmp eq i32 2, 0 + br i1 %0, label %next5, label %next6 +next5: + br label %next7 +next6: + br label %next5 +next7: + ret void +} +; CHECK-LABEL: @outline_outputs1( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP0:%.*]] = icmp eq i32 2, 0 +; CHECK-NEXT: call void @outlined_ir_func_0(i1 [[TMP0]]) +; CHECK-NEXT: br label [[NEXT3:%.*]] +; CHECK: next3: +; CHECK-NEXT: br label [[NEXT4:%.*]] +; CHECK: next4: +; CHECK-NEXT: [[TMP1:%.*]] = icmp eq i32 2, 0 +; CHECK-NEXT: call void @outlined_ir_func_0(i1 [[TMP0]]) +; CHECK-NEXT: br label [[NEXT7:%.*]] +; CHECK: next7: +; CHECK-NEXT: ret void +; +; +; CHECK: define internal void @outlined_ir_func_0( +; CHECK: newFuncRoot: +; CHECK-NEXT: br label [[ENTRY_TO_OUTLINE:%.*]] +; CHECK: next3.exitStub: +; CHECK-NEXT: ret void +; CHECK: entry_to_outline: +; CHECK-NEXT: br i1 [[TMP0:%.*]], label [[NEXT:%.*]], label [[NEXT2:%.*]] +; CHECK: next: +; CHECK-NEXT: br label [[NEXT3_EXITSTUB:%.*]] +; CHECK: next2: +; CHECK-NEXT: br 
label [[NEXT]] +; Index: llvm/test/Transforms/IROutliner/outlining-multiple-exits-diff-outputs.ll =================================================================== --- /dev/null +++ llvm/test/Transforms/IROutliner/outlining-multiple-exits-diff-outputs.ll @@ -0,0 +1,229 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --include-generated-funcs +; RUN: opt -S -verify -ir-sim-branches -iroutliner -ir-outlining-no-cost < %s | FileCheck %s + +; Here we have multiple exits, but the different sources, different outputs are +; needed, this checks that they are handled by separate switch statements. + +define void @outline_outputs1() #0 { +entry: + %output = alloca i32, align 4 + %result = alloca i32, align 4 + %output2 = alloca i32, align 4 + %result2 = alloca i32, align 4 + %a = alloca i32, align 4 + %b = alloca i32, align 4 + br label %block_2 +block_1: + %a2 = alloca i32, align 4 + %b2 = alloca i32, align 4 + br label %block_2 +block_2: + %a2val = load i32, i32* %a + %b2val = load i32, i32* %b + %add2 = add i32 2, %a2val + %mul2 = mul i32 2, %b2val + br label %block_5 +block_3: + %aval = load i32, i32* %a + %bval = load i32, i32* %b + %add = add i32 2, %aval + %mul = mul i32 2, %bval + br label %block_4 +block_4: + store i32 %add, i32* %output, align 4 + store i32 %mul, i32* %result, align 4 + br label %block_6 +block_5: + store i32 %add2, i32* %output, align 4 + store i32 %mul2, i32* %result, align 4 + br label %block_7 +block_6: + %div = udiv i32 %aval, %bval + ret void +block_7: + %sub = sub i32 %a2val, %b2val + ret void +} + +define void @outline_outputs2() #0 { +entry: + %output = alloca i32, align 4 + %result = alloca i32, align 4 + %output2 = alloca i32, align 4 + %result2 = alloca i32, align 4 + %a = alloca i32, align 4 + %b = alloca i32, align 4 + br label %block_2 +block_1: + %a2 = alloca i32, align 4 + %b2 = alloca i32, align 4 + br label %block_2 +block_2: + %a2val = load i32, i32* %a + %b2val = load i32, i32* %b + 
%add2 = add i32 2, %a2val + %mul2 = mul i32 2, %b2val + br label %block_5 +block_3: + %aval = load i32, i32* %a + %bval = load i32, i32* %b + %add = add i32 2, %aval + %mul = mul i32 2, %bval + br label %block_4 +block_4: + store i32 %add, i32* %output, align 4 + store i32 %mul, i32* %result, align 4 + br label %block_7 +block_5: + store i32 %add2, i32* %output, align 4 + store i32 %mul2, i32* %result, align 4 + br label %block_6 +block_6: + %diff = sub i32 %a2val, %b2val + ret void +block_7: + %quot = udiv i32 %add, %mul + ret void +} +; CHECK-LABEL: @outline_outputs1( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[BVAL_LOC:%.*]] = alloca i32, align 4 +; CHECK-NEXT: [[AVAL_LOC:%.*]] = alloca i32, align 4 +; CHECK-NEXT: [[B2VAL_LOC:%.*]] = alloca i32, align 4 +; CHECK-NEXT: [[A2VAL_LOC:%.*]] = alloca i32, align 4 +; CHECK-NEXT: [[OUTPUT:%.*]] = alloca i32, align 4 +; CHECK-NEXT: [[RESULT:%.*]] = alloca i32, align 4 +; CHECK-NEXT: [[OUTPUT2:%.*]] = alloca i32, align 4 +; CHECK-NEXT: [[RESULT2:%.*]] = alloca i32, align 4 +; CHECK-NEXT: [[A:%.*]] = alloca i32, align 4 +; CHECK-NEXT: [[B:%.*]] = alloca i32, align 4 +; CHECK-NEXT: br label [[BLOCK_2:%.*]] +; CHECK: block_1: +; CHECK-NEXT: [[A2:%.*]] = alloca i32, align 4 +; CHECK-NEXT: [[B2:%.*]] = alloca i32, align 4 +; CHECK-NEXT: br label [[BLOCK_2]] +; CHECK: block_2: +; CHECK-NEXT: [[LT_CAST:%.*]] = bitcast i32* [[A2VAL_LOC]] to i8* +; CHECK-NEXT: call void @llvm.lifetime.start.p0i8(i64 -1, i8* [[LT_CAST]]) +; CHECK-NEXT: [[LT_CAST1:%.*]] = bitcast i32* [[B2VAL_LOC]] to i8* +; CHECK-NEXT: call void @llvm.lifetime.start.p0i8(i64 -1, i8* [[LT_CAST1]]) +; CHECK-NEXT: [[LT_CAST2:%.*]] = bitcast i32* [[AVAL_LOC]] to i8* +; CHECK-NEXT: call void @llvm.lifetime.start.p0i8(i64 -1, i8* [[LT_CAST2]]) +; CHECK-NEXT: [[LT_CAST3:%.*]] = bitcast i32* [[BVAL_LOC]] to i8* +; CHECK-NEXT: call void @llvm.lifetime.start.p0i8(i64 -1, i8* [[LT_CAST3]]) +; CHECK-NEXT: [[TMP0:%.*]] = call i1 @outlined_ir_func_0(i32* [[A]], i32* [[B]], i32* 
[[OUTPUT]], i32* [[RESULT]], i32* [[A2VAL_LOC]], i32* [[B2VAL_LOC]], i32* [[AVAL_LOC]], i32* [[BVAL_LOC]], i32 0) +; CHECK-NEXT: [[A2VAL_RELOAD:%.*]] = load i32, i32* [[A2VAL_LOC]], align 4 +; CHECK-NEXT: [[B2VAL_RELOAD:%.*]] = load i32, i32* [[B2VAL_LOC]], align 4 +; CHECK-NEXT: [[AVAL_RELOAD:%.*]] = load i32, i32* [[AVAL_LOC]], align 4 +; CHECK-NEXT: [[BVAL_RELOAD:%.*]] = load i32, i32* [[BVAL_LOC]], align 4 +; CHECK-NEXT: call void @llvm.lifetime.end.p0i8(i64 -1, i8* [[LT_CAST]]) +; CHECK-NEXT: call void @llvm.lifetime.end.p0i8(i64 -1, i8* [[LT_CAST1]]) +; CHECK-NEXT: call void @llvm.lifetime.end.p0i8(i64 -1, i8* [[LT_CAST2]]) +; CHECK-NEXT: call void @llvm.lifetime.end.p0i8(i64 -1, i8* [[LT_CAST3]]) +; CHECK-NEXT: br i1 [[TMP0]], label [[BLOCK_6:%.*]], label [[BLOCK_7:%.*]] +; CHECK: block_6: +; CHECK-NEXT: [[DIV:%.*]] = udiv i32 [[AVAL_RELOAD]], [[BVAL_RELOAD]] +; CHECK-NEXT: ret void +; CHECK: block_7: +; CHECK-NEXT: [[SUB:%.*]] = sub i32 [[A2VAL_RELOAD]], [[B2VAL_RELOAD]] +; CHECK-NEXT: ret void +; +; +; CHECK-LABEL: @outline_outputs2( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[MUL_LOC:%.*]] = alloca i32, align 4 +; CHECK-NEXT: [[ADD_LOC:%.*]] = alloca i32, align 4 +; CHECK-NEXT: [[B2VAL_LOC:%.*]] = alloca i32, align 4 +; CHECK-NEXT: [[A2VAL_LOC:%.*]] = alloca i32, align 4 +; CHECK-NEXT: [[OUTPUT:%.*]] = alloca i32, align 4 +; CHECK-NEXT: [[RESULT:%.*]] = alloca i32, align 4 +; CHECK-NEXT: [[OUTPUT2:%.*]] = alloca i32, align 4 +; CHECK-NEXT: [[RESULT2:%.*]] = alloca i32, align 4 +; CHECK-NEXT: [[A:%.*]] = alloca i32, align 4 +; CHECK-NEXT: [[B:%.*]] = alloca i32, align 4 +; CHECK-NEXT: br label [[BLOCK_2:%.*]] +; CHECK: block_1: +; CHECK-NEXT: [[A2:%.*]] = alloca i32, align 4 +; CHECK-NEXT: [[B2:%.*]] = alloca i32, align 4 +; CHECK-NEXT: br label [[BLOCK_2]] +; CHECK: block_2: +; CHECK-NEXT: [[LT_CAST:%.*]] = bitcast i32* [[A2VAL_LOC]] to i8* +; CHECK-NEXT: call void @llvm.lifetime.start.p0i8(i64 -1, i8* [[LT_CAST]]) +; CHECK-NEXT: [[LT_CAST1:%.*]] = bitcast 
i32* [[B2VAL_LOC]] to i8* +; CHECK-NEXT: call void @llvm.lifetime.start.p0i8(i64 -1, i8* [[LT_CAST1]]) +; CHECK-NEXT: [[LT_CAST2:%.*]] = bitcast i32* [[ADD_LOC]] to i8* +; CHECK-NEXT: call void @llvm.lifetime.start.p0i8(i64 -1, i8* [[LT_CAST2]]) +; CHECK-NEXT: [[LT_CAST3:%.*]] = bitcast i32* [[MUL_LOC]] to i8* +; CHECK-NEXT: call void @llvm.lifetime.start.p0i8(i64 -1, i8* [[LT_CAST3]]) +; CHECK-NEXT: [[TMP0:%.*]] = call i1 @outlined_ir_func_0(i32* [[A]], i32* [[B]], i32* [[OUTPUT]], i32* [[RESULT]], i32* [[A2VAL_LOC]], i32* [[B2VAL_LOC]], i32* [[ADD_LOC]], i32* [[MUL_LOC]], i32 1) +; CHECK-NEXT: [[A2VAL_RELOAD:%.*]] = load i32, i32* [[A2VAL_LOC]], align 4 +; CHECK-NEXT: [[B2VAL_RELOAD:%.*]] = load i32, i32* [[B2VAL_LOC]], align 4 +; CHECK-NEXT: [[ADD_RELOAD:%.*]] = load i32, i32* [[ADD_LOC]], align 4 +; CHECK-NEXT: [[MUL_RELOAD:%.*]] = load i32, i32* [[MUL_LOC]], align 4 +; CHECK-NEXT: call void @llvm.lifetime.end.p0i8(i64 -1, i8* [[LT_CAST]]) +; CHECK-NEXT: call void @llvm.lifetime.end.p0i8(i64 -1, i8* [[LT_CAST1]]) +; CHECK-NEXT: call void @llvm.lifetime.end.p0i8(i64 -1, i8* [[LT_CAST2]]) +; CHECK-NEXT: call void @llvm.lifetime.end.p0i8(i64 -1, i8* [[LT_CAST3]]) +; CHECK-NEXT: br i1 [[TMP0]], label [[BLOCK_7:%.*]], label [[BLOCK_6:%.*]] +; CHECK: block_6: +; CHECK-NEXT: [[DIFF:%.*]] = sub i32 [[A2VAL_RELOAD]], [[B2VAL_RELOAD]] +; CHECK-NEXT: ret void +; CHECK: block_7: +; CHECK-NEXT: [[QUOT:%.*]] = udiv i32 [[ADD_RELOAD]], [[MUL_RELOAD]] +; CHECK-NEXT: ret void +; +; +; CHECK: define internal i1 @outlined_ir_func_0( +; CHECK: newFuncRoot: +; CHECK-NEXT: br label [[BLOCK_2_TO_OUTLINE:%.*]] +; CHECK: block_6.exitStub: +; CHECK-NEXT: switch i32 [[TMP8:%.*]], label [[FINAL_BLOCK_1:%.*]] [ +; CHECK-NEXT: i32 0, label [[OUTPUT_BLOCK_0_1:%.*]] +; CHECK-NEXT: i32 1, label [[OUTPUT_BLOCK_1_1:%.*]] +; CHECK-NEXT: ] +; CHECK: block_7.exitStub: +; CHECK-NEXT: switch i32 [[TMP8]], label [[FINAL_BLOCK_0:%.*]] [ +; CHECK-NEXT: i32 0, label [[OUTPUT_BLOCK_0_0:%.*]] +; 
CHECK-NEXT: i32 1, label [[OUTPUT_BLOCK_1_0:%.*]] +; CHECK-NEXT: ] +; CHECK: block_2_to_outline: +; CHECK-NEXT: [[A2VAL:%.*]] = load i32, i32* [[TMP0:%.*]], align 4 +; CHECK-NEXT: [[B2VAL:%.*]] = load i32, i32* [[TMP1:%.*]], align 4 +; CHECK-NEXT: [[ADD2:%.*]] = add i32 2, [[A2VAL]] +; CHECK-NEXT: [[MUL2:%.*]] = mul i32 2, [[B2VAL]] +; CHECK-NEXT: br label [[BLOCK_5:%.*]] +; CHECK: block_3: +; CHECK-NEXT: [[AVAL:%.*]] = load i32, i32* [[TMP0]], align 4 +; CHECK-NEXT: [[BVAL:%.*]] = load i32, i32* [[TMP1]], align 4 +; CHECK-NEXT: [[ADD:%.*]] = add i32 2, [[AVAL]] +; CHECK-NEXT: [[MUL:%.*]] = mul i32 2, [[BVAL]] +; CHECK-NEXT: br label [[BLOCK_4:%.*]] +; CHECK: block_4: +; CHECK-NEXT: store i32 [[ADD]], i32* [[TMP2:%.*]], align 4 +; CHECK-NEXT: store i32 [[MUL]], i32* [[TMP3:%.*]], align 4 +; CHECK-NEXT: br label [[BLOCK_6_EXITSTUB:%.*]] +; CHECK: block_5: +; CHECK-NEXT: store i32 [[ADD2]], i32* [[TMP2]], align 4 +; CHECK-NEXT: store i32 [[MUL2]], i32* [[TMP3]], align 4 +; CHECK-NEXT: br label [[BLOCK_7_EXITSTUB:%.*]] +; CHECK: output_block_0_0: +; CHECK-NEXT: store i32 [[A2VAL]], i32* [[TMP4:%.*]], align 4 +; CHECK-NEXT: store i32 [[B2VAL]], i32* [[TMP5:%.*]], align 4 +; CHECK-NEXT: br label [[FINAL_BLOCK_0]] +; CHECK: output_block_0_1: +; CHECK-NEXT: store i32 [[AVAL]], i32* [[TMP6:%.*]], align 4 +; CHECK-NEXT: store i32 [[BVAL]], i32* [[TMP7:%.*]], align 4 +; CHECK-NEXT: br label [[FINAL_BLOCK_1]] +; CHECK: output_block_1_0: +; CHECK-NEXT: store i32 [[A2VAL]], i32* [[TMP4]], align 4 +; CHECK-NEXT: store i32 [[B2VAL]], i32* [[TMP5]], align 4 +; CHECK-NEXT: br label [[FINAL_BLOCK_0]] +; CHECK: output_block_1_1: +; CHECK-NEXT: store i32 [[ADD]], i32* [[TMP6]], align 4 +; CHECK-NEXT: store i32 [[MUL]], i32* [[TMP7]], align 4 +; CHECK-NEXT: br label [[FINAL_BLOCK_1]] +; CHECK: final_block_0: +; CHECK-NEXT: ret i1 false +; CHECK: final_block_1: +; CHECK-NEXT: ret i1 true +; Index: llvm/test/Transforms/IROutliner/outlining-multiple-exits.ll 
=================================================================== --- /dev/null +++ llvm/test/Transforms/IROutliner/outlining-multiple-exits.ll @@ -0,0 +1,208 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --include-generated-funcs +; RUN: opt -S -verify -ir-sim-branches -iroutliner -ir-outlining-no-cost < %s | FileCheck %s + +; Here we have multiple exits, but the different sources, same outputs are +; needed, this checks that they are compressed, and moved into the appropriate +; output blocks. + +define void @outline_outputs1() #0 { +entry: + %output = alloca i32, align 4 + %result = alloca i32, align 4 + %output2 = alloca i32, align 4 + %result2 = alloca i32, align 4 + %a = alloca i32, align 4 + %b = alloca i32, align 4 + br label %block_2 +block_1: + %a2 = alloca i32, align 4 + %b2 = alloca i32, align 4 + br label %block_2 +block_2: + %a2val = load i32, i32* %a + %b2val = load i32, i32* %b + %add2 = add i32 2, %a2val + %mul2 = mul i32 2, %b2val + br label %block_5 +block_3: + %aval = load i32, i32* %a + %bval = load i32, i32* %b + %add = add i32 2, %aval + %mul = mul i32 2, %bval + br label %block_4 +block_4: + store i32 %add, i32* %output, align 4 + store i32 %mul, i32* %result, align 4 + br label %block_6 +block_5: + store i32 %add2, i32* %output, align 4 + store i32 %mul2, i32* %result, align 4 + br label %block_7 +block_6: + %div = udiv i32 %aval, %bval + ret void +block_7: + %sub = sub i32 %a2val, %b2val + ret void +} + +define void @outline_outputs2() #0 { +entry: + %output = alloca i32, align 4 + %result = alloca i32, align 4 + %output2 = alloca i32, align 4 + %result2 = alloca i32, align 4 + %a = alloca i32, align 4 + %b = alloca i32, align 4 + br label %block_2 +block_1: + %a2 = alloca i32, align 4 + %b2 = alloca i32, align 4 + br label %block_2 +block_2: + %a2val = load i32, i32* %a + %b2val = load i32, i32* %b + %add2 = add i32 2, %a2val + %mul2 = mul i32 2, %b2val + br label %block_5 +block_3: + %aval = 
load i32, i32* %a + %bval = load i32, i32* %b + %add = add i32 2, %aval + %mul = mul i32 2, %bval + br label %block_4 +block_4: + store i32 %add, i32* %output, align 4 + store i32 %mul, i32* %result, align 4 + br label %block_7 +block_5: + store i32 %add2, i32* %output, align 4 + store i32 %mul2, i32* %result, align 4 + br label %block_6 +block_6: + %diff = sub i32 %a2val, %b2val + ret void +block_7: + %quot = udiv i32 %aval, %bval + ret void +} +; CHECK-LABEL: @outline_outputs1( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[BVAL_LOC:%.*]] = alloca i32, align 4 +; CHECK-NEXT: [[AVAL_LOC:%.*]] = alloca i32, align 4 +; CHECK-NEXT: [[B2VAL_LOC:%.*]] = alloca i32, align 4 +; CHECK-NEXT: [[A2VAL_LOC:%.*]] = alloca i32, align 4 +; CHECK-NEXT: [[OUTPUT:%.*]] = alloca i32, align 4 +; CHECK-NEXT: [[RESULT:%.*]] = alloca i32, align 4 +; CHECK-NEXT: [[OUTPUT2:%.*]] = alloca i32, align 4 +; CHECK-NEXT: [[RESULT2:%.*]] = alloca i32, align 4 +; CHECK-NEXT: [[A:%.*]] = alloca i32, align 4 +; CHECK-NEXT: [[B:%.*]] = alloca i32, align 4 +; CHECK-NEXT: br label [[BLOCK_2:%.*]] +; CHECK: block_1: +; CHECK-NEXT: [[A2:%.*]] = alloca i32, align 4 +; CHECK-NEXT: [[B2:%.*]] = alloca i32, align 4 +; CHECK-NEXT: br label [[BLOCK_2]] +; CHECK: block_2: +; CHECK-NEXT: [[LT_CAST:%.*]] = bitcast i32* [[A2VAL_LOC]] to i8* +; CHECK-NEXT: call void @llvm.lifetime.start.p0i8(i64 -1, i8* [[LT_CAST]]) +; CHECK-NEXT: [[LT_CAST1:%.*]] = bitcast i32* [[B2VAL_LOC]] to i8* +; CHECK-NEXT: call void @llvm.lifetime.start.p0i8(i64 -1, i8* [[LT_CAST1]]) +; CHECK-NEXT: [[LT_CAST2:%.*]] = bitcast i32* [[AVAL_LOC]] to i8* +; CHECK-NEXT: call void @llvm.lifetime.start.p0i8(i64 -1, i8* [[LT_CAST2]]) +; CHECK-NEXT: [[LT_CAST3:%.*]] = bitcast i32* [[BVAL_LOC]] to i8* +; CHECK-NEXT: call void @llvm.lifetime.start.p0i8(i64 -1, i8* [[LT_CAST3]]) +; CHECK-NEXT: [[TARGETBLOCK:%.*]] = call i1 @outlined_ir_func_0(i32* [[A]], i32* [[B]], i32* [[OUTPUT]], i32* [[RESULT]], i32* [[A2VAL_LOC]], i32* [[B2VAL_LOC]], i32* [[AVAL_LOC]], 
i32* [[BVAL_LOC]]) +; CHECK-NEXT: [[A2VAL_RELOAD:%.*]] = load i32, i32* [[A2VAL_LOC]], align 4 +; CHECK-NEXT: [[B2VAL_RELOAD:%.*]] = load i32, i32* [[B2VAL_LOC]], align 4 +; CHECK-NEXT: [[AVAL_RELOAD:%.*]] = load i32, i32* [[AVAL_LOC]], align 4 +; CHECK-NEXT: [[BVAL_RELOAD:%.*]] = load i32, i32* [[BVAL_LOC]], align 4 +; CHECK-NEXT: call void @llvm.lifetime.end.p0i8(i64 -1, i8* [[LT_CAST]]) +; CHECK-NEXT: call void @llvm.lifetime.end.p0i8(i64 -1, i8* [[LT_CAST1]]) +; CHECK-NEXT: call void @llvm.lifetime.end.p0i8(i64 -1, i8* [[LT_CAST2]]) +; CHECK-NEXT: call void @llvm.lifetime.end.p0i8(i64 -1, i8* [[LT_CAST3]]) +; CHECK-NEXT: br i1 [[TARGETBLOCK]], label [[BLOCK_6:%.*]], label [[BLOCK_7:%.*]] +; CHECK: block_6: +; CHECK-NEXT: [[DIV:%.*]] = udiv i32 [[AVAL_RELOAD]], [[BVAL_RELOAD]] +; CHECK-NEXT: ret void +; CHECK: block_7: +; CHECK-NEXT: [[SUB:%.*]] = sub i32 [[A2VAL_RELOAD]], [[B2VAL_RELOAD]] +; CHECK-NEXT: ret void +; +; +; CHECK-LABEL: @outline_outputs2( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[BVAL_LOC:%.*]] = alloca i32, align 4 +; CHECK-NEXT: [[AVAL_LOC:%.*]] = alloca i32, align 4 +; CHECK-NEXT: [[B2VAL_LOC:%.*]] = alloca i32, align 4 +; CHECK-NEXT: [[A2VAL_LOC:%.*]] = alloca i32, align 4 +; CHECK-NEXT: [[OUTPUT:%.*]] = alloca i32, align 4 +; CHECK-NEXT: [[RESULT:%.*]] = alloca i32, align 4 +; CHECK-NEXT: [[OUTPUT2:%.*]] = alloca i32, align 4 +; CHECK-NEXT: [[RESULT2:%.*]] = alloca i32, align 4 +; CHECK-NEXT: [[A:%.*]] = alloca i32, align 4 +; CHECK-NEXT: [[B:%.*]] = alloca i32, align 4 +; CHECK-NEXT: br label [[BLOCK_2:%.*]] +; CHECK: block_1: +; CHECK-NEXT: [[A2:%.*]] = alloca i32, align 4 +; CHECK-NEXT: [[B2:%.*]] = alloca i32, align 4 +; CHECK-NEXT: br label [[BLOCK_2]] +; CHECK: block_2: +; CHECK-NEXT: [[LT_CAST:%.*]] = bitcast i32* [[A2VAL_LOC]] to i8* +; CHECK-NEXT: call void @llvm.lifetime.start.p0i8(i64 -1, i8* [[LT_CAST]]) +; CHECK-NEXT: [[LT_CAST1:%.*]] = bitcast i32* [[B2VAL_LOC]] to i8* +; CHECK-NEXT: call void @llvm.lifetime.start.p0i8(i64 -1, i8* 
[[LT_CAST1]]) +; CHECK-NEXT: [[LT_CAST2:%.*]] = bitcast i32* [[AVAL_LOC]] to i8* +; CHECK-NEXT: call void @llvm.lifetime.start.p0i8(i64 -1, i8* [[LT_CAST2]]) +; CHECK-NEXT: [[LT_CAST3:%.*]] = bitcast i32* [[BVAL_LOC]] to i8* +; CHECK-NEXT: call void @llvm.lifetime.start.p0i8(i64 -1, i8* [[LT_CAST3]]) +; CHECK-NEXT: [[TARGETBLOCK:%.*]] = call i1 @outlined_ir_func_0(i32* [[A]], i32* [[B]], i32* [[OUTPUT]], i32* [[RESULT]], i32* [[A2VAL_LOC]], i32* [[B2VAL_LOC]], i32* [[AVAL_LOC]], i32* [[BVAL_LOC]]) +; CHECK-NEXT: [[A2VAL_RELOAD:%.*]] = load i32, i32* [[A2VAL_LOC]], align 4 +; CHECK-NEXT: [[B2VAL_RELOAD:%.*]] = load i32, i32* [[B2VAL_LOC]], align 4 +; CHECK-NEXT: [[AVAL_RELOAD:%.*]] = load i32, i32* [[AVAL_LOC]], align 4 +; CHECK-NEXT: [[BVAL_RELOAD:%.*]] = load i32, i32* [[BVAL_LOC]], align 4 +; CHECK-NEXT: call void @llvm.lifetime.end.p0i8(i64 -1, i8* [[LT_CAST]]) +; CHECK-NEXT: call void @llvm.lifetime.end.p0i8(i64 -1, i8* [[LT_CAST1]]) +; CHECK-NEXT: call void @llvm.lifetime.end.p0i8(i64 -1, i8* [[LT_CAST2]]) +; CHECK-NEXT: call void @llvm.lifetime.end.p0i8(i64 -1, i8* [[LT_CAST3]]) +; CHECK-NEXT: br i1 [[TARGETBLOCK]], label [[BLOCK_7:%.*]], label [[BLOCK_6:%.*]] +; CHECK: block_6: +; CHECK-NEXT: [[DIFF:%.*]] = sub i32 [[A2VAL_RELOAD]], [[B2VAL_RELOAD]] +; CHECK-NEXT: ret void +; CHECK: block_7: +; CHECK-NEXT: [[QUOT:%.*]] = udiv i32 [[AVAL_RELOAD]], [[BVAL_RELOAD]] +; CHECK-NEXT: ret void +; +; +; CHECK: define internal i1 @outlined_ir_func_0( +; CHECK: newFuncRoot: +; CHECK-NEXT: br label [[BLOCK_2_TO_OUTLINE:%.*]] +; CHECK: block_6.exitStub: +; CHECK-NEXT: store i32 [[AVAL:%.*]], i32* [[TMP6:%.*]], align 4 +; CHECK-NEXT: store i32 [[BVAL:%.*]], i32* [[TMP7:%.*]], align 4 +; CHECK-NEXT: ret i1 true +; CHECK: block_7.exitStub: +; CHECK-NEXT: store i32 [[A2VAL:%.*]], i32* [[TMP4:%.*]], align 4 +; CHECK-NEXT: store i32 [[B2VAL:%.*]], i32* [[TMP5:%.*]], align 4 +; CHECK-NEXT: ret i1 false +; CHECK: block_2_to_outline: +; CHECK-NEXT: [[A2VAL]] = load i32, i32* 
[[TMP0:%.*]], align 4 +; CHECK-NEXT: [[B2VAL]] = load i32, i32* [[TMP1:%.*]], align 4 +; CHECK-NEXT: [[ADD2:%.*]] = add i32 2, [[A2VAL]] +; CHECK-NEXT: [[MUL2:%.*]] = mul i32 2, [[B2VAL]] +; CHECK-NEXT: br label [[BLOCK_5:%.*]] +; CHECK: block_3: +; CHECK-NEXT: [[AVAL]] = load i32, i32* [[TMP0]], align 4 +; CHECK-NEXT: [[BVAL]] = load i32, i32* [[TMP1]], align 4 +; CHECK-NEXT: [[ADD:%.*]] = add i32 2, [[AVAL]] +; CHECK-NEXT: [[MUL:%.*]] = mul i32 2, [[BVAL]] +; CHECK-NEXT: br label [[BLOCK_4:%.*]] +; CHECK: block_4: +; CHECK-NEXT: store i32 [[ADD]], i32* [[TMP2:%.*]], align 4 +; CHECK-NEXT: store i32 [[MUL]], i32* [[TMP3:%.*]], align 4 +; CHECK-NEXT: br label [[BLOCK_6_EXITSTUB:%.*]] +; CHECK: block_5: +; CHECK-NEXT: store i32 [[ADD2]], i32* [[TMP2]], align 4 +; CHECK-NEXT: store i32 [[MUL2]], i32* [[TMP3]], align 4 +; CHECK-NEXT: br label [[BLOCK_7_EXITSTUB:%.*]] +; Index: llvm/unittests/Analysis/IRSimilarityIdentifierTest.cpp =================================================================== --- llvm/unittests/Analysis/IRSimilarityIdentifierTest.cpp +++ llvm/unittests/Analysis/IRSimilarityIdentifierTest.cpp @@ -726,22 +726,8 @@ ASSERT_TRUE(UnsignedVec[0] != UnsignedVec[1]); } -// In most cases, the illegal instructions we are collecting don't require any -// sort of setup. In these cases, we can just only have illegal instructions, -// and the mapper will create 0 length vectors, and we can check that. - -// In cases where we have legal instructions needed to set up the illegal -// instruction, to check illegal instructions are assigned unsigned integers -// from the maximum value decreasing to 0, it will be greater than a legal -// instruction that comes after. So to check that we have an illegal -// instruction, we place a legal instruction after an illegal instruction, and -// check that the illegal unsigned integer is greater than the unsigned integer -// of the legal instruction. 
- -// Checks that the branch is mapped to be illegal since there is extra checking -// needed to ensure that a branch in one region is branching to an isomorphic -// location in a different region. -TEST(IRInstructionMapper, BranchIllegal) { +// Checks that the branch is mapped to legal when the option is set. +TEST(IRInstructionMapper, BranchLegal) { StringRef ModuleString = R"( define i32 @f(i32 %a, i32 %b) { bb0: @@ -759,12 +745,28 @@ SpecificBumpPtrAllocator InstDataAllocator; SpecificBumpPtrAllocator IDLAllocator; IRInstructionMapper Mapper(&InstDataAllocator, &IDLAllocator); + Mapper.InstClassifier.EnableBranches = true; + Mapper.initializeForBBs(*M); getVectors(*M, Mapper, InstrList, UnsignedVec); ASSERT_EQ(InstrList.size(), UnsignedVec.size()); - ASSERT_EQ(UnsignedVec.size(), static_cast(0)); + ASSERT_EQ(UnsignedVec.size(), static_cast(3)); + ASSERT_TRUE(UnsignedVec[1] > UnsignedVec[0]); + ASSERT_TRUE(UnsignedVec[1] < UnsignedVec[2]); } +// In most cases, the illegal instructions we are collecting don't require any +// sort of setup. In these cases, we can just only have illegal instructions, +// and the mapper will create 0 length vectors, and we can check that. + +// In cases where we have legal instructions needed to set up the illegal +// instruction, to check illegal instructions are assigned unsigned integers +// from the maximum value decreasing to 0, it will be greater than a legal +// instruction that comes after. So to check that we have an illegal +// instruction, we place a legal instruction after an illegal instruction, and +// check that the illegal unsigned integer is greater than the unsigned integer +// of the legal instruction. + // Checks that a PHINode is mapped to be illegal since there is extra checking // needed to ensure that a branch in one region is bin an isomorphic // location in a different region. 
@@ -1445,8 +1447,8 @@
   getVectors(*M, Mapper, InstrList, UnsignedVec);
 
   ASSERT_EQ(InstrList.size(), UnsignedVec.size());
-  ASSERT_EQ(UnsignedVec.size(), static_cast<unsigned>(6));
-  ASSERT_TRUE(UnsignedVec[2] < UnsignedVec[1]);
+  ASSERT_EQ(UnsignedVec.size(), static_cast<unsigned>(7));
+  ASSERT_TRUE(UnsignedVec[2] < UnsignedVec[0]);
 }
 
 // Checks that a memcpy instruction is mapped to an illegal value.
@@ -1476,8 +1478,9 @@
   getVectors(*M, Mapper, InstrList, UnsignedVec);
 
   ASSERT_EQ(InstrList.size(), UnsignedVec.size());
-  ASSERT_EQ(UnsignedVec.size(), static_cast<unsigned>(6));
-  ASSERT_TRUE(UnsignedVec[2] < UnsignedVec[1]);
+  ASSERT_EQ(UnsignedVec.size(), static_cast<unsigned>(7));
+  ASSERT_TRUE(UnsignedVec[2] > UnsignedVec[3]);
+  ASSERT_TRUE(UnsignedVec[2] < UnsignedVec[0]);
 }
 
 // Checks that a memmove instruction is mapped to an illegal value.
@@ -1507,8 +1511,8 @@
   getVectors(*M, Mapper, InstrList, UnsignedVec);
 
   ASSERT_EQ(InstrList.size(), UnsignedVec.size());
-  ASSERT_EQ(UnsignedVec.size(), static_cast<unsigned>(6));
-  ASSERT_TRUE(UnsignedVec[2] < UnsignedVec[1]);
+  ASSERT_EQ(UnsignedVec.size(), static_cast<unsigned>(7));
+  ASSERT_TRUE(UnsignedVec[2] < UnsignedVec[0]);
 }
 
 // Checks that a variable argument instructions are mapped to an illegal value.
@@ -1555,11 +1559,10 @@
   getVectors(*M, Mapper, InstrList, UnsignedVec);
 
   ASSERT_EQ(InstrList.size(), UnsignedVec.size());
-  ASSERT_EQ(UnsignedVec.size(), static_cast<unsigned>(16));
-  ASSERT_TRUE(UnsignedVec[4] < UnsignedVec[3]);
-  ASSERT_TRUE(UnsignedVec[7] < UnsignedVec[6]);
-  ASSERT_TRUE(UnsignedVec[10] < UnsignedVec[9]);
-  ASSERT_TRUE(UnsignedVec[13] < UnsignedVec[12]);
+  ASSERT_EQ(UnsignedVec.size(), static_cast<unsigned>(17));
+  ASSERT_TRUE(UnsignedVec[7] < UnsignedVec[0]);
+  ASSERT_TRUE(UnsignedVec[13] < UnsignedVec[10]);
+  ASSERT_TRUE(UnsignedVec[16] < UnsignedVec[13]);
 }
 
 // Check the length of adding two illegal instructions one after th other.
We
@@ -1599,21 +1602,25 @@
 // two blocks of two legal instructions and terminator, and checks them for
 // instruction similarity.
 static bool longSimCandCompare(std::vector<IRInstructionData *> &InstrList,
-                               bool Structure = false) {
+                               bool Structure = false,
+                               unsigned Length = 2,
+                               unsigned StartIdxOne = 0,
+                               unsigned StartIdxTwo = 3) {
   std::vector<IRInstructionData *>::iterator Start, End;
 
   Start = InstrList.begin();
   End = InstrList.begin();
 
-  std::advance(End, 1);
-  IRSimilarityCandidate Cand1(0, 2, *Start, *End);
+  std::advance(Start, StartIdxOne);
+  std::advance(End, StartIdxOne + Length - 1);
+  IRSimilarityCandidate Cand1(StartIdxOne, Length, *Start, *End);
 
   Start = InstrList.begin();
   End = InstrList.begin();
 
-  std::advance(Start, 3);
-  std::advance(End, 4);
-  IRSimilarityCandidate Cand2(3, 2, *Start, *End);
+  std::advance(Start, StartIdxTwo);
+  std::advance(End, StartIdxTwo + Length - 1);
+  IRSimilarityCandidate Cand2(StartIdxTwo, Length, *Start, *End);
   if (Structure)
     return IRSimilarityCandidate::compareStructure(Cand1, Cand2);
   return IRSimilarityCandidate::isSimilar(Cand1, Cand2);
@@ -2083,6 +2089,206 @@
   ASSERT_TRUE(longSimCandCompare(InstrList, true));
 }
 
+// Checks that the same structure is recognized between two candidates when
+// the branches target other blocks inside the same region; the relative
+// distance between the blocks must be the same.
+TEST(IRSimilarityCandidate, SameBranchStructureInternal) { + StringRef ModuleString = R"( + define i32 @f(i32 %a, i32 %b) { + bb0: + %0 = add i32 %a, %b + %1 = add i32 %b, %a + br label %bb1 + bb1: + %2 = add i32 %b, %a + %3 = add i32 %a, %b + ret i32 0 + } + + define i32 @f2(i32 %a, i32 %b) { + bb0: + %0 = add i32 %a, %b + %1 = add i32 %b, %a + br label %bb1 + bb1: + %2 = add i32 %b, %a + %3 = add i32 %a, %b + ret i32 0 + })"; + LLVMContext Context; + std::unique_ptr M = makeLLVMModule(Context, ModuleString); + + std::vector InstrList; + std::vector UnsignedVec; + + SpecificBumpPtrAllocator InstDataAllocator; + SpecificBumpPtrAllocator IDLAllocator; + IRInstructionMapper Mapper(&InstDataAllocator, &IDLAllocator); + Mapper.InstClassifier.EnableBranches = true; + Mapper.initializeForBBs(*M); + getVectors(*M, Mapper, InstrList, UnsignedVec); + + // Check to make sure that we have a long enough region. + ASSERT_EQ(InstrList.size(), static_cast(12)); + // Check that the instructions were added correctly to both vectors. + ASSERT_TRUE(InstrList.size() == UnsignedVec.size()); + + ASSERT_TRUE(longSimCandCompare(InstrList, true, 5, 0, 6)); +} + +// Checks that the different structure is recognized between two candidates, +// when the branches target other blocks inside the same region, the relative +// distance between the blocks must be the same. 
+TEST(IRSimilarityCandidate, DifferentBranchStructureInternal) { + StringRef ModuleString = R"( + define i32 @f(i32 %a, i32 %b) { + bb0: + %0 = add i32 %a, %b + %1 = add i32 %b, %a + br label %bb2 + bb1: + %2 = add i32 %b, %a + %3 = add i32 %a, %b + br label %bb2 + bb2: + %4 = add i32 %b, %a + %5 = add i32 %a, %b + ret i32 0 + } + + define i32 @f2(i32 %a, i32 %b) { + bb0: + %0 = add i32 %a, %b + %1 = add i32 %b, %a + br label %bb1 + bb1: + %2 = add i32 %b, %a + %3 = add i32 %a, %b + br label %bb2 + bb2: + %4 = add i32 %b, %a + %5 = add i32 %a, %b + ret i32 0 + })"; + LLVMContext Context; + std::unique_ptr M = makeLLVMModule(Context, ModuleString); + + std::vector InstrList; + std::vector UnsignedVec; + + SpecificBumpPtrAllocator InstDataAllocator; + SpecificBumpPtrAllocator IDLAllocator; + IRInstructionMapper Mapper(&InstDataAllocator, &IDLAllocator); + Mapper.InstClassifier.EnableBranches = true; + Mapper.initializeForBBs(*M); + getVectors(*M, Mapper, InstrList, UnsignedVec); + + // Check to make sure that we have a long enough region. + ASSERT_EQ(InstrList.size(), static_cast(18)); + // Check that the instructions were added correctly to both vectors. + ASSERT_TRUE(InstrList.size() == UnsignedVec.size()); + + ASSERT_FALSE(longSimCandCompare(InstrList, true, 6, 0, 9)); +} + +// Checks that the same structure is recognized between two candidates, when +// the branches target other blocks outside region, the relative distance +// does not need to be the same. 
+TEST(IRSimilarityCandidate, SameBranchStructureOutside) {
+  StringRef ModuleString = R"(
+  define i32 @f(i32 %a, i32 %b) {
+  bb0:
+     %0 = add i32 %a, %b
+     %1 = add i32 %b, %a
+     br label %bb1
+  bb1:
+     %2 = add i32 %b, %a
+     %3 = add i32 %a, %b
+     ret i32 0
+  }
+
+  define i32 @f2(i32 %a, i32 %b) {
+  bb0:
+     %0 = add i32 %a, %b
+     %1 = add i32 %b, %a
+     br label %bb1
+  bb1:
+     %2 = add i32 %b, %a
+     %3 = add i32 %a, %b
+     ret i32 0
+  })";
+  LLVMContext Context;
+  std::unique_ptr<Module> M = makeLLVMModule(Context, ModuleString);
+
+  std::vector<IRInstructionData *> InstrList;
+  std::vector<unsigned> UnsignedVec;
+
+  SpecificBumpPtrAllocator<IRInstructionData> InstDataAllocator;
+  SpecificBumpPtrAllocator<IRInstructionDataList> IDLAllocator;
+  IRInstructionMapper Mapper(&InstDataAllocator, &IDLAllocator);
+  Mapper.InstClassifier.EnableBranches = true;
+  Mapper.initializeForBBs(*M);
+  getVectors(*M, Mapper, InstrList, UnsignedVec);
+
+  // Check to make sure that we have a long enough region.
+  ASSERT_EQ(InstrList.size(), static_cast<unsigned>(12));
+  // Check that the instructions were added correctly to both vectors.
+  ASSERT_TRUE(InstrList.size() == UnsignedVec.size());
+
+  ASSERT_TRUE(longSimCandCompare(InstrList, true, 3, 0, 6));
+}
+
+// Checks that the same structure is still recognized between two candidates
+// even when the branches target different blocks outside the region; the
+// relative distance to blocks outside the region does not need to be the same.
+TEST(IRSimilarityCandidate, DifferentBranchStructureOutside) { + StringRef ModuleString = R"( + define i32 @f(i32 %a, i32 %b) { + bb0: + %0 = add i32 %a, %b + %1 = add i32 %b, %a + br label %bb1 + bb1: + %2 = add i32 %b, %a + %3 = add i32 %a, %b + ret i32 0 + } + + define i32 @f2(i32 %a, i32 %b) { + bb0: + %0 = add i32 %a, %b + %1 = add i32 %b, %a + br label %bb2 + bb1: + %2 = add i32 %b, %a + %3 = add i32 %a, %b + br label %bb2 + bb2: + %4 = add i32 %b, %a + %5 = add i32 %a, %b + ret i32 0 + })"; + LLVMContext Context; + std::unique_ptr M = makeLLVMModule(Context, ModuleString); + + std::vector InstrList; + std::vector UnsignedVec; + + SpecificBumpPtrAllocator InstDataAllocator; + SpecificBumpPtrAllocator IDLAllocator; + IRInstructionMapper Mapper(&InstDataAllocator, &IDLAllocator); + Mapper.InstClassifier.EnableBranches = true; + Mapper.initializeForBBs(*M); + getVectors(*M, Mapper, InstrList, UnsignedVec); + + // Check to make sure that we have a long enough region. + ASSERT_EQ(InstrList.size(), static_cast(15)); + // Check that the instructions were added correctly to both vectors. + ASSERT_TRUE(InstrList.size() == UnsignedVec.size()); + + ASSERT_TRUE(longSimCandCompare(InstrList, true, 3, 0, 6)); +} + // Checks that two sets of identical instructions are found to be the same. // Both sequences of adds have the same operand ordering, and the same // instructions, making them strcturally equivalent.