diff --git a/llvm/include/llvm/Transforms/IPO/IROutliner.h b/llvm/include/llvm/Transforms/IPO/IROutliner.h
--- a/llvm/include/llvm/Transforms/IPO/IROutliner.h
+++ b/llvm/include/llvm/Transforms/IPO/IROutliner.h
@@ -73,6 +73,10 @@
   /// The number of extracted inputs from the CodeExtractor.
   unsigned NumExtractedInputs;
 
+  /// The corresponding BasicBlock with the appropriate stores for this
+  /// OutlinableRegion in the overall function.
+  unsigned OutputBlockNum;
+
   /// Mapping the extracted argument number to the argument number in the
   /// overall function.  Since there will be inputs, such as elevated constants
   /// that are not the same in each region in a SimilarityGroup, or values that
@@ -87,6 +91,11 @@
   /// since the CodeExtractor does not recognize constants.
   DenseMap<unsigned, Constant *> AggArgToConstant;
 
+  /// The global value numbers that are used as outputs for this section. Once
+  /// extracted, each output will be stored to an output register.  This
+  /// documents the global value numbers that are used in this pattern.
+  SmallVector<unsigned, 4> GVNStores;
+
   /// Used to create an outlined function.
   CodeExtractor *CE = nullptr;
 
@@ -192,6 +201,15 @@
   void findAddInputsOutputs(Module &M, OutlinableRegion &Region,
                             DenseSet<unsigned> &NotSame);
 
+  /// Update the output mapping based on the load instruction, and the outputs
+  /// of the extracted function.
+  ///
+  /// \param Region - The region extracted
+  /// \param Outputs - The outputs from the extracted function.
+  /// \param LI - The load instruction used to update the mapping.
+  void updateOutputMapping(OutlinableRegion &Region,
+                           ArrayRef<Value *> Outputs, LoadInst *LI);
+
   /// Extract \p Region into its own function.
   ///
   /// \param [in] Region - The region to be extracted into its own function.
@@ -218,6 +236,11 @@
   /// TargetTransformInfo lambda for target specific information.
   function_ref<TargetTransformInfo &(Function &)> getTTI;
 
+  /// A mapping from newly created reloaded output values to the original value.
+  /// If an value is replace by an output from an outlined region, this maps
+  /// that Value, back to its original Value.
+  DenseMap<Value *, Value *> OutputMappings;
+
   /// IRSimilarityIdentifier lambda to retrieve IRSimilarityIdentifier.
   function_ref<IRSimilarityIdentifier &(Module &)> getIRSI;
 
diff --git a/llvm/lib/Transforms/IPO/IROutliner.cpp b/llvm/lib/Transforms/IPO/IROutliner.cpp
--- a/llvm/lib/Transforms/IPO/IROutliner.cpp
+++ b/llvm/lib/Transforms/IPO/IROutliner.cpp
@@ -50,6 +50,9 @@
   /// for extraction.
   bool IgnoreGroup = false;
 
+  /// The return block for the overall function.
+  BasicBlock *EndBB = nullptr;
+
   /// Flag for whether the \ref ArgumentTypes have been defined after the
   /// extraction of the first region.
   bool InputTypesSet = false;
@@ -343,18 +346,48 @@
 /// CodeExtractor.
 /// \param [out] EndInputNumbers - The global value numbers for the extracted
 /// arguments.
+/// \param [in] OutputMappings - The mapping of values that have been replaced
+/// by a new output value.
+/// \param [out] EndInputs - The global value numbers for the extracted
+/// arguments.
 static void mapInputsToGVNs(IRSimilarityCandidate &C,
                             SetVector<Value *> &CurrentInputs,
+                            const DenseMap<Value *, Value *> &OutputMappings,
                             std::vector<unsigned> &EndInputNumbers) {
-  // Get the global value number for each input.
+  // Get the Global Value Number for each input.  We check if the Value has been
+  // replaced by a different value at output, and use the original value before
+  // replacement.
   for (Value *Input : CurrentInputs) {
     assert(Input && "Have a nullptr as an input");
+    if (OutputMappings.find(Input) != OutputMappings.end())
+      Input = OutputMappings.find(Input)->second;
     assert(C.getGVN(Input).hasValue() &&
            "Could not find a numbering for the given input");
     EndInputNumbers.push_back(C.getGVN(Input).getValue());
   }
 }
 
+/// Find the original value for the \p ArgInput values if any one of them was
+/// replaced during a previous extraction.
+///
+/// \param [in] ArgInputs - The inputs to be extracted by the code extractor.
+/// \param [in] OutputMappings - The mapping of values that have been replaced
+/// by a new output value.
+/// \param [out] RemappedArgInputs - The remapped values according to
+/// \p OutputMappings that will be extracted.
+static void
+remapExtractedInputs(const ArrayRef<Value *> ArgInputs,
+                     const DenseMap<Value *, Value *> &OutputMappings,
+                     SetVector<Value *> &RemappedArgInputs) {
+  // Get the global value number for each input that will be extracted as an
+  // argument by the code extractor, remapping if needed for reloaded values.
+  for (Value *Input : ArgInputs) {
+    if (OutputMappings.find(Input) != OutputMappings.end())
+      Input = OutputMappings.find(Input)->second;
+    RemappedArgInputs.insert(Input);
+  }
+}
+
 /// Find the input GVNs and the output values for a region of Instructions.
 /// Using the code extractor, we collect the inputs to the extracted function.
 ///
@@ -368,19 +401,25 @@
 /// \param [in] NotSame - The global value numbers in the region that do not
 /// have the same constant value in the regions structurally similar to
 /// \p Region.
+/// \param [in] OutputMappings - The mapping of values that have been replaced
+/// by a new output value after extraction.
 /// \param [out] ArgInputs - The values of the inputs to the extracted function.
-static void getCodeExtractorArguments(OutlinableRegion &Region,
-                                      std::vector<unsigned> &InputGVNs,
-                                      DenseSet<unsigned> &NotSame,
-                                      SetVector<Value *> &ArgInputs) {
+/// \param [out] Outputs - The set of values extracted by the CodeExtractor
+/// as outputs.
+static void getCodeExtractorArguments(
+    OutlinableRegion &Region, std::vector<unsigned> &InputGVNs,
+    DenseSet<unsigned> &NotSame, DenseMap<Value *, Value *> &OutputMappings,
+    SetVector<Value *> &ArgInputs, SetVector<Value *> &Outputs) {
   IRSimilarityCandidate &C = *Region.Candidate;
 
   // OverallInputs are the inputs to the region found by the CodeExtractor,
   // SinkCands and HoistCands are used by the CodeExtractor to find sunken
   // allocas of values whose lifetimes are contained completely within the
-  // outlined region. Outputs are values used outside of the outlined region
-  // found by the CodeExtractor.
-  SetVector<Value *> OverallInputs, SinkCands, HoistCands, Outputs;
+  // outlined region. PremappedInputs are the arguments found by the
+  // CodeExtractor, removing conditions such as sunken allocas, but that
+  // may need to be remapped due to the extracted output values replacing
+  // the original values.
+  SetVector<Value *> OverallInputs, PremappedInputs, SinkCands, HoistCands;
 
   // Use the code extractor to get the inputs and outputs, without sunken
   // allocas or removing llvm.assumes.
@@ -400,27 +439,24 @@
 
   // Find if any values are going to be sunk into the function when extracted
   CE->findAllocas(CEAC, SinkCands, HoistCands, Dummy);
-  CE->findInputsOutputs(ArgInputs, Outputs, SinkCands);
-
-  // TODO: Support regions with output values.  Outputs add an extra layer of
-  // resolution that adds too much complexity at this stage.
-  if (Outputs.size() > 0) {
-    Region.IgnoreRegion = true;
-    return;
-  }
+  CE->findInputsOutputs(PremappedInputs, Outputs, SinkCands);
 
   // TODO: Support regions with sunken allocas: values whose lifetimes are
   // contained completely within the outlined region.  These are not guaranteed
   // to be the same in every region, so we must elevate them all to arguments
   // when they appear.  If these values are not equal, it means there is some
   // Input in OverallInputs that was removed for ArgInputs.
-  if (ArgInputs.size() != OverallInputs.size()) {
+  if (OverallInputs.size() != PremappedInputs.size()) {
     Region.IgnoreRegion = true;
     return;
   }
 
   findConstants(C, NotSame, InputGVNs);
-  mapInputsToGVNs(C, OverallInputs, InputGVNs);
+
+  mapInputsToGVNs(C, OverallInputs, OutputMappings, InputGVNs);
+
+  remapExtractedInputs(PremappedInputs.getArrayRef(), OutputMappings,
+                       ArgInputs);
 
   // Sort the GVNs, since we now have constants included in the \ref InputGVNs
   // we need to make sure they are in a deterministic order.
@@ -439,7 +475,7 @@
 /// function.
 static void
 findExtractedInputToOverallInputMapping(OutlinableRegion &Region,
-                                        std::vector<unsigned> InputGVNs,
+                                        std::vector<unsigned> &InputGVNs,
                                         SetVector<Value *> &ArgInputs) {
 
   IRSimilarityCandidate &C = *Region.Candidate;
@@ -494,12 +530,82 @@
   Region.NumExtractedInputs = OriginalIndex;
 }
 
+/// Create a mapping of the output arguments for the \p Region to the output
+/// arguments of the overall outlined function.
+///
+/// \param [in,out] Region - The region of code to be analyzed.
+/// \param [in] Outputs - The values found by the code extractor.
+static void
+findExtractedOutputToOverallOutputMapping(OutlinableRegion &Region,
+                                          ArrayRef<Value *> Outputs) {
+  OutlinableGroup &Group = *Region.Parent;
+  IRSimilarityCandidate &C = *Region.Candidate;
+
+  // This counts the argument number in the extracted function.
+  unsigned OriginalIndex = Region.NumExtractedInputs;
+
+  // This counts the argument number in the overall function.
+  unsigned TypeIndex = Group.NumAggregateInputs;
+  bool TypeFound;
+  DenseSet<unsigned> AggArgsUsed;
+
+  // Iterate over the output types and identify if there is an aggregate pointer
+  // type whose base type matches the current output type. If there is, we mark
+  // that we will use this output register for this value. If not we add another
+  // type to the overall argument type list. We also store the GVNs used for
+  // stores to identify which values will need to be moved into an special
+  // block that holds the stores to the output registers.
+  for (Value *Output : Outputs) {
+    TypeFound = false;
+    // We can do this since it is a result value, and will have a number
+    // that is necessarily the same. BUT if in the future, the instructions
+    // do not have to be in same order, but are functionally the same, we will
+    // have to use a different scheme, as one-to-one correspondence is not
+    // guaranteed.
+    unsigned GlobalValue = C.getGVN(Output).getValue();
+    unsigned ArgumentSize = Group.ArgumentTypes.size();
+
+    for (unsigned Jdx = TypeIndex; Jdx < ArgumentSize; Jdx++) {
+      if (Group.ArgumentTypes[Jdx] != PointerType::getUnqual(Output->getType()))
+        continue;
+
+      if (AggArgsUsed.find(Jdx) != AggArgsUsed.end())
+        continue;
+
+      TypeFound = true;
+      AggArgsUsed.insert(Jdx);
+      Region.ExtractedArgToAgg.insert(std::make_pair(OriginalIndex, Jdx));
+      Region.AggArgToExtracted.insert(std::make_pair(Jdx, OriginalIndex));
+      Region.GVNStores.push_back(GlobalValue);
+      break;
+    }
+
+    // We were unable to find an unused type in the output type set that matches
+    // the output, so we add a pointer type to the argument types of the overall
+    // function to handle this output and create a mapping to it.
+    if (!TypeFound) {
+      Group.ArgumentTypes.push_back(PointerType::getUnqual(Output->getType()));
+      AggArgsUsed.insert(Group.ArgumentTypes.size() - 1);
+      Region.ExtractedArgToAgg.insert(
+          std::make_pair(OriginalIndex, Group.ArgumentTypes.size() - 1));
+      Region.AggArgToExtracted.insert(
+          std::make_pair(Group.ArgumentTypes.size() - 1, OriginalIndex));
+      Region.GVNStores.push_back(GlobalValue);
+    }
+
+    stable_sort(Region.GVNStores);
+    OriginalIndex++;
+    TypeIndex++;
+  }
+}
+
 void IROutliner::findAddInputsOutputs(Module &M, OutlinableRegion &Region,
                                       DenseSet<unsigned> &NotSame) {
   std::vector<unsigned> Inputs;
-  SetVector<Value *> ArgInputs;
+  SetVector<Value *> ArgInputs, Outputs;
 
-  getCodeExtractorArguments(Region, Inputs, NotSame, ArgInputs);
+  getCodeExtractorArguments(Region, Inputs, NotSame, OutputMappings, ArgInputs,
+                            Outputs);
 
   if (Region.IgnoreRegion)
     return;
@@ -507,6 +613,10 @@
   // Map the inputs found by the CodeExtractor to the arguments found for
   // the overall function.
   findExtractedInputToOverallInputMapping(Region, Inputs, ArgInputs);
+
+  // Map the outputs found by the CodeExtractor to the arguments found for
+  // the overall function.
+  findExtractedOutputToOverallOutputMapping(Region, Outputs.getArrayRef());
 }
 
 /// Replace the extracted function in the Region with a call to the overall
@@ -544,6 +654,18 @@
   // new argument list.
   for (unsigned AggArgIdx = 0; AggArgIdx < AggFunc->arg_size(); AggArgIdx++) {
 
+    if (AggArgIdx == AggFunc->arg_size() - 1 &&
+        Group.ArgumentTypes.size() > Group.NumAggregateInputs) {
+      // If we are on the last argument, and we need to differentiate between
+      // output blocks, add an integer to the argument list to determine
+      // what block to take
+      LLVM_DEBUG(dbgs() << "Set switch block argument to "
+                        << Region.OutputBlockNum << "\n");
+      NewCallArgs.push_back(ConstantInt::get(Type::getInt32Ty(M.getContext()),
+                                             Region.OutputBlockNum));
+      continue;
+    }
+
     ArgPair = Region.AggArgToExtracted.find(AggArgIdx);
     if (ArgPair != Region.AggArgToExtracted.end()) {
       Value *ArgumentValue = Call->getArgOperand(ArgPair->second);
@@ -603,8 +725,11 @@
 // Within an extracted function, replace the argument uses of the extracted
 // region with the arguments of the function for an OutlinableGroup.
 //
-// \param OS [in] - The region of extracted code to be changed.
-static void replaceArgumentUses(OutlinableRegion &Region) {
+/// \param [in] Region - The region of extracted code to be changed.
+/// \param [in,out] OutputBB - The BasicBlock for the output stores for this
+/// region.
+static void replaceArgumentUses(OutlinableRegion &Region,
+                                BasicBlock *OutputBB) {
   OutlinableGroup &Group = *Region.Parent;
   assert(Region.ExtractedFunction && "Region has no extracted function?");
 
@@ -618,7 +743,29 @@
     Argument *Arg = Region.ExtractedFunction->getArg(ArgIdx);
     // The argument is an input, so we can simply replace it with the overall
     // argument value
-    LLVM_DEBUG(dbgs() << "Replacing uses of input " << *Arg << " in function "
+    if (ArgIdx < Region.NumExtractedInputs) {
+      LLVM_DEBUG(dbgs() << "Replacing uses of input " << *Arg << " in function "
+                        << *Region.ExtractedFunction << " with " << *AggArg
+                        << " in function " << *Group.OutlinedFunction << "\n");
+      Arg->replaceAllUsesWith(AggArg);
+      continue;
+    }
+
+    // If we are replacing an output, we place the store value in its own
+    // block inside the overall function before replacing the use of the output
+    // in the function.
+    assert(Arg->hasOneUse() && "Output argument can only have one use");
+    User *InstAsUser = Arg->user_back();
+    assert(InstAsUser && "User is nullptr!");
+
+    Instruction *I = cast<Instruction>(InstAsUser);
+    I->setDebugLoc(DebugLoc());
+    LLVM_DEBUG(dbgs() << "Move store for instruction " << *I << " to "
+                      << *OutputBB << "\n");
+
+    I->moveBefore(*OutputBB, OutputBB->end());
+
+    LLVM_DEBUG(dbgs() << "Replacing uses of output " << *Arg << " in function "
                       << *Region.ExtractedFunction << " with " << *AggArg
                       << " in function " << *Group.OutlinedFunction << "\n");
     Arg->replaceAllUsesWith(AggArg);
@@ -656,38 +803,181 @@
   }
 }
 
+/// For the given function, find all the nondebug or lifetime instructions,
+/// and return them as a vector. Exclude any blocks in \p ExludeBlocks.
+///
+/// \param [in] F - The function we collect the instructions from.
+/// \param [in] ExcludeBlocks - BasicBlocks to ignore.
+/// \returns the list of instructions extracted.
+static std::vector<Instruction *>
+collectRelevantInstructions(Function &F,
+                            DenseSet<BasicBlock *> &ExcludeBlocks) {
+  std::vector<Instruction *> RelevantInstructions;
+
+  for (BasicBlock &BB : F) {
+    if (ExcludeBlocks.find(&BB) != ExcludeBlocks.end())
+      continue;
+
+    for (Instruction &Inst : BB) {
+      if (Inst.isLifetimeStartOrEnd())
+        continue;
+      if (isa<DbgInfoIntrinsic>(Inst))
+        continue;
+
+      RelevantInstructions.push_back(&Inst);
+    }
+  }
+
+  return RelevantInstructions;
+}
+
+/// For the outlined section, move needed the StoreInsts for the output
+/// registers into their own block.  Then, determine if there is a duplicate
+/// output block already created.
+///
+/// \param [in] OG - The OutlinableGroup of regions to be outlined.
+/// \param [in] Region - The OutlinableRegion that is being analyzed.
+/// \param [in,out] OutputBB - the block that stores for this region will be
+/// placed in.
+/// \param [in] EndBB - the final block of the extracted function.
+/// \param [in] OutputMappings - OutputMappings the mapping of values that have
+/// been replaced by a new output value.
+/// \param [in,out] OutputStoreBBs - The existing output blocks.
+static void
+alignOutputBlockWithAggFunc(OutlinableGroup &OG, OutlinableRegion &Region,
+                            BasicBlock *OutputBB, BasicBlock *EndBB,
+                            const DenseMap<Value *, Value *> &OutputMappings,
+                            std::vector<BasicBlock *> &OutputStoreBBs) {
+  DenseSet<unsigned> ValuesToFind(Region.GVNStores.begin(),
+                                  Region.GVNStores.end());
+
+  // We iterate over the instructions in the extracted function, and find the
+  // global value number of the instructions.  If we find a value that should
+  // be contained in a store, we replace the uses of the value with the value
+  // from the overall function, so that the store is storing the correct
+  // value from the overall function.
+
+  DenseSet<BasicBlock *> ExcludeBBs(OutputStoreBBs.begin(),
+                                    OutputStoreBBs.end());
+  std::vector<Instruction *> ExtractedFunctionInsts =
+      collectRelevantInstructions(*(Region.ExtractedFunction), ExcludeBBs);
+  std::vector<Instruction *> OverallFunctionInsts =
+      collectRelevantInstructions(*OG.OutlinedFunction, ExcludeBBs);
+
+  assert(ExtractedFunctionInsts.size() == OverallFunctionInsts.size() &&
+         "Number of relevant instructions not equal!");
+
+  unsigned NumInstructions = ExtractedFunctionInsts.size();
+  for (unsigned Idx = 0; Idx < NumInstructions; Idx++) {
+    Value *V = ExtractedFunctionInsts[Idx];
+
+    if (OutputMappings.find(V) != OutputMappings.end())
+      V = OutputMappings.find(V)->second;
+    Optional<unsigned> GVN = Region.Candidate->getGVN(V);
+
+    // If we have found one of the stored values for output, replace the value
+    // with the corresponding one from the overall function.
+    if (GVN.hasValue() &&
+        ValuesToFind.find(GVN.getValue()) != ValuesToFind.end()) {
+      ValuesToFind.erase(GVN.getValue());
+      V->replaceAllUsesWith(OverallFunctionInsts[Idx]);
+      if (ValuesToFind.size() == 0)
+        break;
+    }
+
+    if (ValuesToFind.size() == 0)
+      break;
+  }
+
+  assert(ValuesToFind.size() == 0 && "Not all store values were handled!");
+}
+
+/// Create the switch statement for outlined function to differentiate between
+/// all the output blocks.
+///
+/// For the outlined section, determine if an outlined block already exists that
+/// matches the needed stores for the extracted section.
+/// \param [in] M - The module we are outlining from.
+/// \param [in] OG - The group of regions to be outlined.
+/// \param [in] OS - The region that is being analyzed.
+/// \param [in] EndBB - The final block of the extracted function.
+/// \param [in,out] OutputStoreBBs - The existing output blocks.
+void createSwitchStatement(Module &M, OutlinableGroup &OG, BasicBlock *EndBB,
+                           ArrayRef<BasicBlock *> OutputStoreBBs) {
+  Function *AggFunc = OG.OutlinedFunction;
+  // Create a final block
+  BasicBlock *ReturnBlock =
+      BasicBlock::Create(M.getContext(), "final_block", AggFunc);
+  Instruction *Term = EndBB->getTerminator();
+  Term->moveBefore(*ReturnBlock, ReturnBlock->end());
+  // Put the switch statement in the old end basic block for the function with
+  // a fall through to the new return block
+  LLVM_DEBUG(dbgs() << "Create switch statement in " << *AggFunc << " for "
+                    << OutputStoreBBs.size() << "\n");
+  SwitchInst *SwitchI =
+      SwitchInst::Create(AggFunc->getArg(AggFunc->arg_size() - 1), ReturnBlock,
+                         OutputStoreBBs.size(), EndBB);
+
+  unsigned Idx = 0;
+  for (BasicBlock *BB : OutputStoreBBs) {
+    SwitchI->addCase(ConstantInt::get(Type::getInt32Ty(M.getContext()), Idx),
+                     BB);
+    Term = BB->getTerminator();
+    Term->setSuccessor(0, ReturnBlock);
+    Idx++;
+  }
+
+  return;
+}
+
 /// Fill the new function that will serve as the replacement function for all of
 /// the extracted regions of a certain structure from the first region in the
 /// list of regions.  Replace this first region's extracted function with the
 /// new overall function.
 ///
-/// \param M [in] - The module we are outlining from.
-/// \param CurrentGroup [in] - The group of regions to be outlined.
-/// \param FuncsToRemove [in,out] - Extracted functions to erase from module
+/// \param [in] M - The module we are outlining from.
+/// \param [in] CurrentGroup - The group of regions to be outlined.
+/// \param [in,out] OutputStoreBBs - The output blocks for each different
+/// set of stores needed for the different functions.
+/// \param [in,out] FuncsToRemove - Extracted functions to erase from module
 /// once outlining is complete.
 static void fillOverallFunction(Module &M, OutlinableGroup &CurrentGroup,
+                                std::vector<BasicBlock *> &OutputStoreBBs,
                                 std::vector<Function *> &FuncsToRemove) {
   OutlinableRegion *CurrentOS = CurrentGroup.Regions[0];
 
-  // Move first extracted function's instructions into new function
+  // Move first extracted function's instructions into new function.
   LLVM_DEBUG(dbgs() << "Move instructions from "
                     << *CurrentOS->ExtractedFunction << " to instruction "
                     << *CurrentGroup.OutlinedFunction << "\n");
-  moveFunctionData(*CurrentOS->ExtractedFunction,
-                   *CurrentGroup.OutlinedFunction);
 
-  // Transfer the attributes
+  CurrentGroup.EndBB = moveFunctionData(*CurrentOS->ExtractedFunction,
+                                        *CurrentGroup.OutlinedFunction);
+
+  // Transfer the attributes from the function to the new function.
   for (Attribute A :
        CurrentOS->ExtractedFunction->getAttributes().getFnAttributes())
     CurrentGroup.OutlinedFunction->addFnAttr(A);
 
-  replaceArgumentUses(*CurrentOS);
+  // Create an output block for the first extracted function.
+  BasicBlock *NewBB = BasicBlock::Create(
+      M.getContext(), Twine("output_block_") + Twine(static_cast<unsigned>(0)),
+      CurrentGroup.OutlinedFunction);
+  CurrentOS->OutputBlockNum = 0;
+
+  replaceArgumentUses(*CurrentOS, NewBB);
   replaceConstants(*CurrentOS);
 
+  if (CurrentGroup.ArgumentTypes.size() > CurrentGroup.NumAggregateInputs) {
+    BranchInst::Create(CurrentGroup.EndBB, NewBB);
+    OutputStoreBBs.push_back(NewBB);
+  } else
+    NewBB->eraseFromParent();
+
   // Replace the call to the extracted function with the outlined function.
   CurrentOS->Call = replaceCalledFunction(M, *CurrentOS);
 
-  // We only delete the extracted funcitons at the end since we may need to
+  // We only delete the extracted functions at the end since we may need to
   // reference instructions contained in them for mapping purposes.
   FuncsToRemove.push_back(CurrentOS->ExtractedFunction);
 }
@@ -701,17 +991,35 @@
 
   OutlinableRegion *CurrentOS;
 
-  fillOverallFunction(M, CurrentGroup, FuncsToRemove);
+  fillOverallFunction(M, CurrentGroup, OutputStoreBBs, FuncsToRemove);
 
-  // Do the same for the other extracted functions
   for (unsigned Idx = 1; Idx < CurrentGroup.Regions.size(); Idx++) {
     CurrentOS = CurrentGroup.Regions[Idx];
 
-    replaceArgumentUses(*CurrentOS);
+    // Create a new BasicBlock to hold the needed store instructions.
+    BasicBlock *NewBB = BasicBlock::Create(
+        M.getContext(), "output_block_" + std::to_string(Idx),
+        CurrentGroup.OutlinedFunction);
+    replaceArgumentUses(*CurrentOS, NewBB);
+
+    if (CurrentGroup.ArgumentTypes.size() > CurrentGroup.NumAggregateInputs) {
+      BranchInst::Create(CurrentGroup.EndBB, NewBB);
+      CurrentOS->OutputBlockNum = OutputStoreBBs.size();
+      OutputStoreBBs.push_back(NewBB);
+      alignOutputBlockWithAggFunc(CurrentGroup, *CurrentOS, NewBB,
+                                  CurrentGroup.EndBB, OutputMappings,
+                                  OutputStoreBBs);
+    } else
+      NewBB->eraseFromParent();
+
     CurrentOS->Call = replaceCalledFunction(M, *CurrentOS);
     FuncsToRemove.push_back(CurrentOS->ExtractedFunction);
   }
 
+  // Create a switch statement to handle the different output schemes.
+  if (CurrentGroup.ArgumentTypes.size() > CurrentGroup.NumAggregateInputs)
+    createSwitchStatement(M, CurrentGroup, CurrentGroup.EndBB, OutputStoreBBs);
+
   OutlinedFunctionNum++;
 }
 
@@ -766,11 +1074,45 @@
   }
 }
 
+void IROutliner::updateOutputMapping(OutlinableRegion &Region,
+                                     ArrayRef<Value *> Outputs,
+                                     LoadInst *LI) {
+  // For and load instructions following the call
+  Value *Operand = LI->getPointerOperand();
+  Optional<unsigned> OutputIdx = None;
+  // Find if the operand it is an output register.
+  for (unsigned ArgIdx = Region.NumExtractedInputs;
+       ArgIdx < Region.Call->arg_size(); ArgIdx++) {
+    if (Operand == Region.Call->getArgOperand(ArgIdx)) {
+      OutputIdx = ArgIdx - Region.NumExtractedInputs;
+      break;
+    }
+  }
+
+  // If we found an output register, place a mapping of the new value
+  // to the original in the mapping.
+  if (!OutputIdx.hasValue())
+    return;
+
+  if (OutputMappings.find(Outputs[OutputIdx.getValue()]) ==
+      OutputMappings.end()) {
+    LLVM_DEBUG(dbgs() << "Mapping extracted output " << *LI << " to "
+                      << *Outputs[OutputIdx.getValue()] << "\n");
+    OutputMappings.insert(std::make_pair(LI, Outputs[OutputIdx.getValue()]));
+  } else {
+    Value *Orig = OutputMappings.find(Outputs[OutputIdx.getValue()])->second;
+    LLVM_DEBUG(dbgs() << "Mapping extracted output " << *Orig << " to "
+                      << *Outputs[OutputIdx.getValue()] << "\n");
+    OutputMappings.insert(std::make_pair(LI, Orig));
+  }
+}
+
 bool IROutliner::extractSection(OutlinableRegion &Region) {
-  assert(Region.StartBB != nullptr &&
-         "StartBB for the OutlinableRegion is nullptr!");
-  assert(Region.FollowBB != nullptr &&
-         "StartBB for the OutlinableRegion is nullptr!");
+  SetVector<Value *> ArgInputs, Outputs, SinkCands;
+  Region.CE->findInputsOutputs(ArgInputs, Outputs, SinkCands);
+
+  assert(Region.StartBB && "StartBB for the OutlinableRegion is nullptr!");
+  assert(Region.FollowBB && "FollowBB for the OutlinableRegion is nullptr!");
   Function *OrigF = Region.StartBB->getParent();
   CodeExtractorAnalysisCache CEAC(*OrigF);
   Region.ExtractedFunction = Region.CE->extractCodeRegion(CEAC);
@@ -816,17 +1158,17 @@
   // Iterate over the new set of instructions to find the new call
   // instruction.
   for (Instruction &I : *RewrittenBB)
-    if (CallInst *CI = dyn_cast<CallInst>(&I))
-      if (Region.ExtractedFunction == CI->getCalledFunction()) {
+    if (CallInst *CI = dyn_cast<CallInst>(&I)) {
+      if (Region.ExtractedFunction == CI->getCalledFunction())
         Region.Call = CI;
-        break;
-      }
+    } else if (LoadInst *LI = dyn_cast<LoadInst>(&I))
+      updateOutputMapping(Region, Outputs.getArrayRef(), LI);
   Region.reattachCandidate();
   return true;
 }
 
 unsigned IROutliner::doOutline(Module &M) {
-  // Find the possibile similarity sections.
+  // Find the possible similarity sections.
   IRSimilarityIdentifier &Identifier = getIRSI(M);
   SimilarityGroupList &SimilarityCandidates = *Identifier.getSimilarity();
 
@@ -886,6 +1228,15 @@
 
     CurrentGroup.Regions = std::move(OutlinedRegions);
 
+    if (CurrentGroup.Regions.empty())
+      continue;
+
+    // We are adding an extracted argument to decide between which output path
+    // to use in the basic block.  It is used in a switch statement and only
+    // needs to be an integer.
+    if (CurrentGroup.ArgumentTypes.size() > CurrentGroup.NumAggregateInputs)
+      CurrentGroup.ArgumentTypes.push_back(Type::getInt32Ty(M.getContext()));
+
     // Create functions out of all the sections, and mark them as outlined.
     OutlinedRegions.clear();
     for (OutlinableRegion *OS : CurrentGroup.Regions) {
diff --git a/llvm/test/Transforms/IROutliner/extraction.ll b/llvm/test/Transforms/IROutliner/extraction.ll
--- a/llvm/test/Transforms/IROutliner/extraction.ll
+++ b/llvm/test/Transforms/IROutliner/extraction.ll
@@ -10,7 +10,7 @@
 ; CHECK-NEXT:    [[A:%.*]] = alloca i32, align 4
 ; CHECK-NEXT:    [[B:%.*]] = alloca i32, align 4
 ; CHECK-NEXT:    [[C:%.*]] = alloca i32, align 4
-; CHECK-NEXT:    call void @outlined_ir_func_0(i32* [[A]], i32* [[B]], i32* [[C]])
+; CHECK-NEXT:    call void @outlined_ir_func_1(i32* [[A]], i32* [[B]], i32* [[C]])
 ; CHECK-NEXT:    ret void
 ;
 entry:
@@ -32,7 +32,7 @@
 ; CHECK-NEXT:    [[A:%.*]] = alloca i32, align 4
 ; CHECK-NEXT:    [[B:%.*]] = alloca i32, align 4
 ; CHECK-NEXT:    [[C:%.*]] = alloca i32, align 4
-; CHECK-NEXT:    call void @outlined_ir_func_0(i32* [[A]], i32* [[B]], i32* [[C]])
+; CHECK-NEXT:    call void @outlined_ir_func_1(i32* [[A]], i32* [[B]], i32* [[C]])
 ; CHECK-NEXT:    ret void
 ;
 entry:
@@ -53,19 +53,23 @@
 define void @extract_outs1() #0 {
 ; CHECK-LABEL: @extract_outs1(
 ; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[DOTLOC:%.*]] = alloca i32, align 4
+; CHECK-NEXT:    [[ADD_LOC:%.*]] = alloca i32, align 4
 ; CHECK-NEXT:    [[A:%.*]] = alloca i32, align 4
 ; CHECK-NEXT:    [[B:%.*]] = alloca i32, align 4
 ; CHECK-NEXT:    [[OUTPUT:%.*]] = alloca i32, align 4
 ; CHECK-NEXT:    [[RESULT:%.*]] = alloca i32, align 4
-; CHECK-NEXT:    store i32 2, i32* [[A]], align 4
-; CHECK-NEXT:    store i32 3, i32* [[B]], align 4
-; CHECK-NEXT:    [[TMP0:%.*]] = load i32, i32* [[A]], align 4
-; CHECK-NEXT:    [[TMP1:%.*]] = load i32, i32* [[B]], align 4
-; CHECK-NEXT:    [[ADD:%.*]] = add i32 [[TMP0]], [[TMP1]]
-; CHECK-NEXT:    store i32 [[ADD]], i32* [[OUTPUT]], align 4
-; CHECK-NEXT:    [[TMP2:%.*]] = load i32, i32* [[OUTPUT]], align 4
-; CHECK-NEXT:    [[TMP3:%.*]] = load i32, i32* [[OUTPUT]], align 4
-; CHECK-NEXT:    call void @outlined_ir_func_1(i32 [[TMP2]], i32 [[ADD]], i32* [[RESULT]])
+; CHECK-NEXT:    [[LT_CAST:%.*]] = bitcast i32* [[ADD_LOC]] to i8*
+; CHECK-NEXT:    call void @llvm.lifetime.start.p0i8(i64 -1, i8* [[LT_CAST]])
+; CHECK-NEXT:    [[LT_CAST1:%.*]] = bitcast i32* [[DOTLOC]] to i8*
+; CHECK-NEXT:    call void @llvm.lifetime.start.p0i8(i64 -1, i8* [[LT_CAST1]])
+; CHECK-NEXT:    call void @outlined_ir_func_0(i32* [[A]], i32* [[B]], i32* [[OUTPUT]], i32* [[ADD_LOC]], i32* [[DOTLOC]], i32 0)
+; CHECK-NEXT:    [[ADD_RELOAD:%.*]] = load i32, i32* [[ADD_LOC]], align 4
+; CHECK-NEXT:    [[DOTRELOAD:%.*]] = load i32, i32* [[DOTLOC]], align 4
+; CHECK-NEXT:    call void @llvm.lifetime.end.p0i8(i64 -1, i8* [[LT_CAST]])
+; CHECK-NEXT:    call void @llvm.lifetime.end.p0i8(i64 -1, i8* [[LT_CAST1]])
+; CHECK-NEXT:    [[TMP0:%.*]] = load i32, i32* [[OUTPUT]], align 4
+; CHECK-NEXT:    call void @outlined_ir_func_2(i32 [[DOTRELOAD]], i32 [[ADD_RELOAD]], i32* [[RESULT]])
 ; CHECK-NEXT:    ret void
 ;
 entry:
@@ -91,18 +95,22 @@
 define void @extract_outs2() #0 {
 ; CHECK-LABEL: @extract_outs2(
 ; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[DOTLOC:%.*]] = alloca i32, align 4
+; CHECK-NEXT:    [[ADD_LOC:%.*]] = alloca i32, align 4
 ; CHECK-NEXT:    [[A:%.*]] = alloca i32, align 4
 ; CHECK-NEXT:    [[B:%.*]] = alloca i32, align 4
 ; CHECK-NEXT:    [[OUTPUT:%.*]] = alloca i32, align 4
 ; CHECK-NEXT:    [[RESULT:%.*]] = alloca i32, align 4
-; CHECK-NEXT:    store i32 2, i32* [[A]], align 4
-; CHECK-NEXT:    store i32 3, i32* [[B]], align 4
-; CHECK-NEXT:    [[TMP0:%.*]] = load i32, i32* [[A]], align 4
-; CHECK-NEXT:    [[TMP1:%.*]] = load i32, i32* [[B]], align 4
-; CHECK-NEXT:    [[ADD:%.*]] = add i32 [[TMP0]], [[TMP1]]
-; CHECK-NEXT:    store i32 [[ADD]], i32* [[OUTPUT]], align 4
-; CHECK-NEXT:    [[TMP2:%.*]] = load i32, i32* [[OUTPUT]], align 4
-; CHECK-NEXT:    call void @outlined_ir_func_1(i32 [[TMP2]], i32 [[ADD]], i32* [[RESULT]])
+; CHECK-NEXT:    [[LT_CAST:%.*]] = bitcast i32* [[ADD_LOC]] to i8*
+; CHECK-NEXT:    call void @llvm.lifetime.start.p0i8(i64 -1, i8* [[LT_CAST]])
+; CHECK-NEXT:    [[LT_CAST1:%.*]] = bitcast i32* [[DOTLOC]] to i8*
+; CHECK-NEXT:    call void @llvm.lifetime.start.p0i8(i64 -1, i8* [[LT_CAST1]])
+; CHECK-NEXT:    call void @outlined_ir_func_0(i32* [[A]], i32* [[B]], i32* [[OUTPUT]], i32* [[ADD_LOC]], i32* [[DOTLOC]], i32 1)
+; CHECK-NEXT:    [[ADD_RELOAD:%.*]] = load i32, i32* [[ADD_LOC]], align 4
+; CHECK-NEXT:    [[DOTRELOAD:%.*]] = load i32, i32* [[DOTLOC]], align 4
+; CHECK-NEXT:    call void @llvm.lifetime.end.p0i8(i64 -1, i8* [[LT_CAST]])
+; CHECK-NEXT:    call void @llvm.lifetime.end.p0i8(i64 -1, i8* [[LT_CAST1]])
+; CHECK-NEXT:    call void @outlined_ir_func_2(i32 [[DOTRELOAD]], i32 [[ADD_RELOAD]], i32* [[RESULT]])
 ; CHECK-NEXT:    ret void
 ;
 entry:
diff --git a/llvm/test/Transforms/IROutliner/illegal-assumes.ll b/llvm/test/Transforms/IROutliner/illegal-assumes.ll
--- a/llvm/test/Transforms/IROutliner/illegal-assumes.ll
+++ b/llvm/test/Transforms/IROutliner/illegal-assumes.ll
@@ -7,15 +7,19 @@
 define void @outline_assumes() {
 ; CHECK-LABEL: @outline_assumes(
 ; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[DL_LOC:%.*]] = alloca i1, align 1
 ; CHECK-NEXT:    [[A:%.*]] = alloca i32, align 4
 ; CHECK-NEXT:    [[B:%.*]] = alloca i32, align 4
 ; CHECK-NEXT:    [[C:%.*]] = alloca i32, align 4
 ; CHECK-NEXT:    [[D:%.*]] = alloca i1, align 4
-; CHECK-NEXT:    store i1 true, i1* [[D]], align 4
-; CHECK-NEXT:    [[DL:%.*]] = load i1, i1* [[D]], align 1
-; CHECK-NEXT:    [[SPLIT_INST:%.*]] = sub i1 [[DL]], [[DL]]
+; CHECK-NEXT:    [[LT_CAST:%.*]] = bitcast i1* [[DL_LOC]] to i8*
+; CHECK-NEXT:    call void @llvm.lifetime.start.p0i8(i64 -1, i8* [[LT_CAST]])
+; CHECK-NEXT:    call void @outlined_ir_func_3(i1 true, i1* [[D]], i1* [[DL_LOC]], i32 0)
+; CHECK-NEXT:    [[DL_RELOAD:%.*]] = load i1, i1* [[DL_LOC]], align 1
+; CHECK-NEXT:    call void @llvm.lifetime.end.p0i8(i64 -1, i8* [[LT_CAST]])
+; CHECK-NEXT:    [[SPLIT_INST:%.*]] = sub i1 [[DL_RELOAD]], [[DL_RELOAD]]
 ; CHECK-NEXT:    call void @outlined_ir_func_0(i32* [[A]], i32* [[B]], i32* [[C]])
-; CHECK-NEXT:    call void @llvm.assume(i1 [[DL]])
+; CHECK-NEXT:    call void @llvm.assume(i1 [[DL_RELOAD]])
 ; CHECK-NEXT:    call void @outlined_ir_func_1(i32* [[A]], i32* [[B]], i32* [[C]])
 ; CHECK-NEXT:    ret void
 ;
@@ -40,14 +44,18 @@
 define void @outline_assumes2() {
 ; CHECK-LABEL: @outline_assumes2(
 ; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[DL_LOC:%.*]] = alloca i1, align 1
 ; CHECK-NEXT:    [[A:%.*]] = alloca i32, align 4
 ; CHECK-NEXT:    [[B:%.*]] = alloca i32, align 4
 ; CHECK-NEXT:    [[C:%.*]] = alloca i32, align 4
 ; CHECK-NEXT:    [[D:%.*]] = alloca i1, align 4
-; CHECK-NEXT:    store i1 false, i1* [[D]], align 4
-; CHECK-NEXT:    [[DL:%.*]] = load i1, i1* [[D]], align 1
+; CHECK-NEXT:    [[LT_CAST:%.*]] = bitcast i1* [[DL_LOC]] to i8*
+; CHECK-NEXT:    call void @llvm.lifetime.start.p0i8(i64 -1, i8* [[LT_CAST]])
+; CHECK-NEXT:    call void @outlined_ir_func_3(i1 false, i1* [[D]], i1* [[DL_LOC]], i32 1)
+; CHECK-NEXT:    [[DL_RELOAD:%.*]] = load i1, i1* [[DL_LOC]], align 1
+; CHECK-NEXT:    call void @llvm.lifetime.end.p0i8(i64 -1, i8* [[LT_CAST]])
 ; CHECK-NEXT:    call void @outlined_ir_func_0(i32* [[A]], i32* [[B]], i32* [[C]])
-; CHECK-NEXT:    call void @llvm.assume(i1 [[DL]])
+; CHECK-NEXT:    call void @llvm.assume(i1 [[DL_RELOAD]])
 ; CHECK-NEXT:    call void @outlined_ir_func_1(i32* [[A]], i32* [[B]], i32* [[C]])
 ; CHECK-NEXT:    ret void
 ;
diff --git a/llvm/test/Transforms/IROutliner/illegal-memcpy.ll b/llvm/test/Transforms/IROutliner/illegal-memcpy.ll
--- a/llvm/test/Transforms/IROutliner/illegal-memcpy.ll
+++ b/llvm/test/Transforms/IROutliner/illegal-memcpy.ll
@@ -9,12 +9,22 @@
 define i8 @function1(i8* noalias %s, i8* noalias %d, i64 %len) {
 ; CHECK-LABEL: @function1(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[A:%.*]] = load i8, i8* [[S:%.*]], align 1
-; CHECK-NEXT:    [[B:%.*]] = load i8, i8* [[D:%.*]], align 1
+; CHECK-NEXT:    [[B_LOC:%.*]] = alloca i8, align 1
+; CHECK-NEXT:    [[A_LOC:%.*]] = alloca i8, align 1
+; CHECK-NEXT:    [[RET_LOC:%.*]] = alloca i8, align 1
+; CHECK-NEXT:    call void @llvm.lifetime.start.p0i8(i64 -1, i8* [[A_LOC]])
+; CHECK-NEXT:    call void @llvm.lifetime.start.p0i8(i64 -1, i8* [[B_LOC]])
+; CHECK-NEXT:    call void @outlined_ir_func_1(i8* [[S:%.*]], i8* [[D:%.*]], i8* [[A_LOC]], i8* [[B_LOC]], i32 0)
+; CHECK-NEXT:    [[A_RELOAD:%.*]] = load i8, i8* [[A_LOC]], align 1
+; CHECK-NEXT:    [[B_RELOAD:%.*]] = load i8, i8* [[B_LOC]], align 1
+; CHECK-NEXT:    call void @llvm.lifetime.end.p0i8(i64 -1, i8* [[A_LOC]])
+; CHECK-NEXT:    call void @llvm.lifetime.end.p0i8(i64 -1, i8* [[B_LOC]])
 ; CHECK-NEXT:    call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[D]], i8* [[S]], i64 [[LEN:%.*]], i1 false)
-; CHECK-NEXT:    [[C:%.*]] = add i8 [[A]], [[B]]
-; CHECK-NEXT:    [[RET:%.*]] = load i8, i8* [[S]], align 1
-; CHECK-NEXT:    ret i8 [[RET]]
+; CHECK-NEXT:    call void @llvm.lifetime.start.p0i8(i64 -1, i8* [[RET_LOC]])
+; CHECK-NEXT:    call void @outlined_ir_func_0(i8 [[A_RELOAD]], i8 [[B_RELOAD]], i8* [[S]], i8* [[RET_LOC]], i32 0)
+; CHECK-NEXT:    [[RET_RELOAD:%.*]] = load i8, i8* [[RET_LOC]], align 1
+; CHECK-NEXT:    call void @llvm.lifetime.end.p0i8(i64 -1, i8* [[RET_LOC]])
+; CHECK-NEXT:    ret i8 [[RET_RELOAD]]
 ;
 entry:
   %a = load i8, i8* %s
@@ -28,12 +38,22 @@
 define i8 @function2(i8* noalias %s, i8* noalias %d, i64 %len) {
 ; CHECK-LABEL: @function2(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[A:%.*]] = load i8, i8* [[S:%.*]], align 1
-; CHECK-NEXT:    [[B:%.*]] = load i8, i8* [[D:%.*]], align 1
+; CHECK-NEXT:    [[B_LOC:%.*]] = alloca i8, align 1
+; CHECK-NEXT:    [[A_LOC:%.*]] = alloca i8, align 1
+; CHECK-NEXT:    [[RET_LOC:%.*]] = alloca i8, align 1
+; CHECK-NEXT:    call void @llvm.lifetime.start.p0i8(i64 -1, i8* [[A_LOC]])
+; CHECK-NEXT:    call void @llvm.lifetime.start.p0i8(i64 -1, i8* [[B_LOC]])
+; CHECK-NEXT:    call void @outlined_ir_func_1(i8* [[S:%.*]], i8* [[D:%.*]], i8* [[A_LOC]], i8* [[B_LOC]], i32 1)
+; CHECK-NEXT:    [[A_RELOAD:%.*]] = load i8, i8* [[A_LOC]], align 1
+; CHECK-NEXT:    [[B_RELOAD:%.*]] = load i8, i8* [[B_LOC]], align 1
+; CHECK-NEXT:    call void @llvm.lifetime.end.p0i8(i64 -1, i8* [[A_LOC]])
+; CHECK-NEXT:    call void @llvm.lifetime.end.p0i8(i64 -1, i8* [[B_LOC]])
 ; CHECK-NEXT:    call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[D]], i8* [[S]], i64 [[LEN:%.*]], i1 false)
-; CHECK-NEXT:    [[C:%.*]] = add i8 [[A]], [[B]]
-; CHECK-NEXT:    [[RET:%.*]] = load i8, i8* [[S]], align 1
-; CHECK-NEXT:    ret i8 [[RET]]
+; CHECK-NEXT:    call void @llvm.lifetime.start.p0i8(i64 -1, i8* [[RET_LOC]])
+; CHECK-NEXT:    call void @outlined_ir_func_0(i8 [[A_RELOAD]], i8 [[B_RELOAD]], i8* [[S]], i8* [[RET_LOC]], i32 1)
+; CHECK-NEXT:    [[RET_RELOAD:%.*]] = load i8, i8* [[RET_LOC]], align 1
+; CHECK-NEXT:    call void @llvm.lifetime.end.p0i8(i64 -1, i8* [[RET_LOC]])
+; CHECK-NEXT:    ret i8 [[RET_RELOAD]]
 ;
 entry:
   %a = load i8, i8* %s
diff --git a/llvm/test/Transforms/IROutliner/illegal-memmove.ll b/llvm/test/Transforms/IROutliner/illegal-memmove.ll
--- a/llvm/test/Transforms/IROutliner/illegal-memmove.ll
+++ b/llvm/test/Transforms/IROutliner/illegal-memmove.ll
@@ -9,12 +9,22 @@
 define i8 @function1(i8* noalias %s, i8* noalias %d, i64 %len) {
 ; CHECK-LABEL: @function1(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[A:%.*]] = load i8, i8* [[S:%.*]], align 1
-; CHECK-NEXT:    [[B:%.*]] = load i8, i8* [[D:%.*]], align 1
+; CHECK-NEXT:    [[B_LOC:%.*]] = alloca i8, align 1
+; CHECK-NEXT:    [[A_LOC:%.*]] = alloca i8, align 1
+; CHECK-NEXT:    [[RET_LOC:%.*]] = alloca i8, align 1
+; CHECK-NEXT:    call void @llvm.lifetime.start.p0i8(i64 -1, i8* [[A_LOC]])
+; CHECK-NEXT:    call void @llvm.lifetime.start.p0i8(i64 -1, i8* [[B_LOC]])
+; CHECK-NEXT:    call void @outlined_ir_func_1(i8* [[S:%.*]], i8* [[D:%.*]], i8* [[A_LOC]], i8* [[B_LOC]], i32 0)
+; CHECK-NEXT:    [[A_RELOAD:%.*]] = load i8, i8* [[A_LOC]], align 1
+; CHECK-NEXT:    [[B_RELOAD:%.*]] = load i8, i8* [[B_LOC]], align 1
+; CHECK-NEXT:    call void @llvm.lifetime.end.p0i8(i64 -1, i8* [[A_LOC]])
+; CHECK-NEXT:    call void @llvm.lifetime.end.p0i8(i64 -1, i8* [[B_LOC]])
 ; CHECK-NEXT:    call void @llvm.memmove.p0i8.p0i8.i64(i8* [[D]], i8* [[S]], i64 [[LEN:%.*]], i1 false)
-; CHECK-NEXT:    [[C:%.*]] = add i8 [[A]], [[B]]
-; CHECK-NEXT:    [[RET:%.*]] = load i8, i8* [[S]], align 1
-; CHECK-NEXT:    ret i8 [[RET]]
+; CHECK-NEXT:    call void @llvm.lifetime.start.p0i8(i64 -1, i8* [[RET_LOC]])
+; CHECK-NEXT:    call void @outlined_ir_func_0(i8 [[A_RELOAD]], i8 [[B_RELOAD]], i8* [[S]], i8* [[RET_LOC]], i32 0)
+; CHECK-NEXT:    [[RET_RELOAD:%.*]] = load i8, i8* [[RET_LOC]], align 1
+; CHECK-NEXT:    call void @llvm.lifetime.end.p0i8(i64 -1, i8* [[RET_LOC]])
+; CHECK-NEXT:    ret i8 [[RET_RELOAD]]
 ;
 entry:
   %a = load i8, i8* %s
@@ -28,12 +38,22 @@
 define i8 @function2(i8* noalias %s, i8* noalias %d, i64 %len) {
 ; CHECK-LABEL: @function2(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[A:%.*]] = load i8, i8* [[S:%.*]], align 1
-; CHECK-NEXT:    [[B:%.*]] = load i8, i8* [[D:%.*]], align 1
+; CHECK-NEXT:    [[B_LOC:%.*]] = alloca i8, align 1
+; CHECK-NEXT:    [[A_LOC:%.*]] = alloca i8, align 1
+; CHECK-NEXT:    [[RET_LOC:%.*]] = alloca i8, align 1
+; CHECK-NEXT:    call void @llvm.lifetime.start.p0i8(i64 -1, i8* [[A_LOC]])
+; CHECK-NEXT:    call void @llvm.lifetime.start.p0i8(i64 -1, i8* [[B_LOC]])
+; CHECK-NEXT:    call void @outlined_ir_func_1(i8* [[S:%.*]], i8* [[D:%.*]], i8* [[A_LOC]], i8* [[B_LOC]], i32 1)
+; CHECK-NEXT:    [[A_RELOAD:%.*]] = load i8, i8* [[A_LOC]], align 1
+; CHECK-NEXT:    [[B_RELOAD:%.*]] = load i8, i8* [[B_LOC]], align 1
+; CHECK-NEXT:    call void @llvm.lifetime.end.p0i8(i64 -1, i8* [[A_LOC]])
+; CHECK-NEXT:    call void @llvm.lifetime.end.p0i8(i64 -1, i8* [[B_LOC]])
 ; CHECK-NEXT:    call void @llvm.memmove.p0i8.p0i8.i64(i8* [[D]], i8* [[S]], i64 [[LEN:%.*]], i1 false)
-; CHECK-NEXT:    [[C:%.*]] = add i8 [[A]], [[B]]
-; CHECK-NEXT:    [[RET:%.*]] = load i8, i8* [[S]], align 1
-; CHECK-NEXT:    ret i8 [[RET]]
+; CHECK-NEXT:    call void @llvm.lifetime.start.p0i8(i64 -1, i8* [[RET_LOC]])
+; CHECK-NEXT:    call void @outlined_ir_func_0(i8 [[A_RELOAD]], i8 [[B_RELOAD]], i8* [[S]], i8* [[RET_LOC]], i32 1)
+; CHECK-NEXT:    [[RET_RELOAD:%.*]] = load i8, i8* [[RET_LOC]], align 1
+; CHECK-NEXT:    call void @llvm.lifetime.end.p0i8(i64 -1, i8* [[RET_LOC]])
+; CHECK-NEXT:    ret i8 [[RET_RELOAD]]
 ;
 entry:
   %a = load i8, i8* %s
diff --git a/llvm/test/Transforms/IROutliner/illegal-vaarg.ll b/llvm/test/Transforms/IROutliner/illegal-vaarg.ll
--- a/llvm/test/Transforms/IROutliner/illegal-vaarg.ll
+++ b/llvm/test/Transforms/IROutliner/illegal-vaarg.ll
@@ -11,17 +11,20 @@
 define i32 @func1(i32 %a, double %b, i8* %v, ...) nounwind {
 ; CHECK-LABEL: @func1(
 ; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[AP1_LOC:%.*]] = alloca i8*, align 8
 ; CHECK-NEXT:    [[A_ADDR:%.*]] = alloca i32, align 4
 ; CHECK-NEXT:    [[B_ADDR:%.*]] = alloca double, align 8
 ; CHECK-NEXT:    [[AP:%.*]] = alloca i8*, align 4
 ; CHECK-NEXT:    [[C:%.*]] = alloca i32, align 4
-; CHECK-NEXT:    store i32 [[A:%.*]], i32* [[A_ADDR]], align 4
-; CHECK-NEXT:    store double [[B:%.*]], double* [[B_ADDR]], align 8
-; CHECK-NEXT:    [[AP1:%.*]] = bitcast i8** [[AP]] to i8*
-; CHECK-NEXT:    call void @llvm.va_start(i8* [[AP1]])
+; CHECK-NEXT:    [[LT_CAST:%.*]] = bitcast i8** [[AP1_LOC]] to i8*
+; CHECK-NEXT:    call void @llvm.lifetime.start.p0i8(i64 -1, i8* [[LT_CAST]])
+; CHECK-NEXT:    call void @outlined_ir_func_0(i32 [[A:%.*]], i32* [[A_ADDR]], double [[B:%.*]], double* [[B_ADDR]], i8** [[AP]], i8** [[AP1_LOC]], i32 0)
+; CHECK-NEXT:    [[AP1_RELOAD:%.*]] = load i8*, i8** [[AP1_LOC]], align 8
+; CHECK-NEXT:    call void @llvm.lifetime.end.p0i8(i64 -1, i8* [[LT_CAST]])
+; CHECK-NEXT:    call void @llvm.va_start(i8* [[AP1_RELOAD]])
 ; CHECK-NEXT:    [[TMP0:%.*]] = va_arg i8** [[AP]], i32
-; CHECK-NEXT:    call void @llvm.va_copy(i8* [[V:%.*]], i8* [[AP1]])
-; CHECK-NEXT:    call void @llvm.va_end(i8* [[AP1]])
+; CHECK-NEXT:    call void @llvm.va_copy(i8* [[V:%.*]], i8* [[AP1_RELOAD]])
+; CHECK-NEXT:    call void @llvm.va_end(i8* [[AP1_RELOAD]])
 ; CHECK-NEXT:    store i32 [[TMP0]], i32* [[C]], align 4
 ; CHECK-NEXT:    [[TMP:%.*]] = load i32, i32* [[C]], align 4
 ; CHECK-NEXT:    ret i32 [[TMP]]
@@ -46,17 +49,20 @@
 define i32 @func2(i32 %a, double %b, i8* %v, ...) nounwind {
 ; CHECK-LABEL: @func2(
 ; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[AP1_LOC:%.*]] = alloca i8*, align 8
 ; CHECK-NEXT:    [[A_ADDR:%.*]] = alloca i32, align 4
 ; CHECK-NEXT:    [[B_ADDR:%.*]] = alloca double, align 8
 ; CHECK-NEXT:    [[AP:%.*]] = alloca i8*, align 4
 ; CHECK-NEXT:    [[C:%.*]] = alloca i32, align 4
-; CHECK-NEXT:    store i32 [[A:%.*]], i32* [[A_ADDR]], align 4
-; CHECK-NEXT:    store double [[B:%.*]], double* [[B_ADDR]], align 8
-; CHECK-NEXT:    [[AP1:%.*]] = bitcast i8** [[AP]] to i8*
-; CHECK-NEXT:    call void @llvm.va_start(i8* [[AP1]])
+; CHECK-NEXT:    [[LT_CAST:%.*]] = bitcast i8** [[AP1_LOC]] to i8*
+; CHECK-NEXT:    call void @llvm.lifetime.start.p0i8(i64 -1, i8* [[LT_CAST]])
+; CHECK-NEXT:    call void @outlined_ir_func_0(i32 [[A:%.*]], i32* [[A_ADDR]], double [[B:%.*]], double* [[B_ADDR]], i8** [[AP]], i8** [[AP1_LOC]], i32 1)
+; CHECK-NEXT:    [[AP1_RELOAD:%.*]] = load i8*, i8** [[AP1_LOC]], align 8
+; CHECK-NEXT:    call void @llvm.lifetime.end.p0i8(i64 -1, i8* [[LT_CAST]])
+; CHECK-NEXT:    call void @llvm.va_start(i8* [[AP1_RELOAD]])
 ; CHECK-NEXT:    [[TMP0:%.*]] = va_arg i8** [[AP]], i32
-; CHECK-NEXT:    call void @llvm.va_copy(i8* [[V:%.*]], i8* [[AP1]])
-; CHECK-NEXT:    call void @llvm.va_end(i8* [[AP1]])
+; CHECK-NEXT:    call void @llvm.va_copy(i8* [[V:%.*]], i8* [[AP1_RELOAD]])
+; CHECK-NEXT:    call void @llvm.va_end(i8* [[AP1_RELOAD]])
 ; CHECK-NEXT:    store i32 [[TMP0]], i32* [[C]], align 4
 ; CHECK-NEXT:    [[AP2:%.*]] = bitcast i8** [[AP]] to i8*
 ; CHECK-NEXT:    [[TMP:%.*]] = load i32, i32* [[C]], align 4
diff --git a/llvm/test/Transforms/IROutliner/outlining-different-output-blocks.ll b/llvm/test/Transforms/IROutliner/outlining-different-output-blocks.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/Transforms/IROutliner/outlining-different-output-blocks.ll
@@ -0,0 +1,110 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -S -iroutliner < %s | FileCheck %s
+
+; These functions are constructed slightly differently so that they require
+; different output blocks for the values used outside of the region. We are
+; checking that two output blocks are created with different values.
+
+define void @outline_outputs1() #0 {
+; CHECK-LABEL: @outline_outputs1(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[DOTLOC:%.*]] = alloca i32, align 4
+; CHECK-NEXT:    [[ADD_LOC:%.*]] = alloca i32, align 4
+; CHECK-NEXT:    [[A:%.*]] = alloca i32, align 4
+; CHECK-NEXT:    [[B:%.*]] = alloca i32, align 4
+; CHECK-NEXT:    [[OUTPUT:%.*]] = alloca i32, align 4
+; CHECK-NEXT:    [[RESULT:%.*]] = alloca i32, align 4
+; CHECK-NEXT:    [[LT_CAST:%.*]] = bitcast i32* [[ADD_LOC]] to i8*
+; CHECK-NEXT:    call void @llvm.lifetime.start.p0i8(i64 -1, i8* [[LT_CAST]])
+; CHECK-NEXT:    [[LT_CAST1:%.*]] = bitcast i32* [[DOTLOC]] to i8*
+; CHECK-NEXT:    call void @llvm.lifetime.start.p0i8(i64 -1, i8* [[LT_CAST1]])
+; CHECK-NEXT:    call void @outlined_ir_func_0(i32* [[A]], i32* [[B]], i32* [[OUTPUT]], i32* [[ADD_LOC]], i32* [[DOTLOC]], i32 0)
+; CHECK-NEXT:    [[ADD_RELOAD:%.*]] = load i32, i32* [[ADD_LOC]], align 4
+; CHECK-NEXT:    [[DOTRELOAD:%.*]] = load i32, i32* [[DOTLOC]], align 4
+; CHECK-NEXT:    call void @llvm.lifetime.end.p0i8(i64 -1, i8* [[LT_CAST]])
+; CHECK-NEXT:    call void @llvm.lifetime.end.p0i8(i64 -1, i8* [[LT_CAST1]])
+; CHECK-NEXT:    [[TMP0:%.*]] = load i32, i32* [[OUTPUT]], align 4
+; CHECK-NEXT:    call void @outlined_ir_func_1(i32 [[DOTRELOAD]], i32 [[ADD_RELOAD]], i32* [[RESULT]])
+; CHECK-NEXT:    ret void
+;
+entry:
+  %a = alloca i32, align 4
+  %b = alloca i32, align 4
+  %output = alloca i32, align 4
+  %result = alloca i32, align 4
+  store i32 2, i32* %a, align 4
+  store i32 3, i32* %b, align 4
+  %0 = load i32, i32* %a, align 4
+  %1 = load i32, i32* %b, align 4
+  %add = add i32 %0, %1
+  %sub = sub i32 %0, %1
+  store i32 %add, i32* %output, align 4
+  %2 = load i32, i32* %output, align 4
+  %3 = load i32, i32* %output, align 4
+  %mul = mul i32 %2, %add
+  store i32 %mul, i32* %result, align 4
+  ret void
+}
+
+define void @outline_outputs2() #0 {
+; CHECK-LABEL: @outline_outputs2(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[DOTLOC:%.*]] = alloca i32, align 4
+; CHECK-NEXT:    [[SUB_LOC:%.*]] = alloca i32, align 4
+; CHECK-NEXT:    [[A:%.*]] = alloca i32, align 4
+; CHECK-NEXT:    [[B:%.*]] = alloca i32, align 4
+; CHECK-NEXT:    [[OUTPUT:%.*]] = alloca i32, align 4
+; CHECK-NEXT:    [[RESULT:%.*]] = alloca i32, align 4
+; CHECK-NEXT:    [[LT_CAST:%.*]] = bitcast i32* [[SUB_LOC]] to i8*
+; CHECK-NEXT:    call void @llvm.lifetime.start.p0i8(i64 -1, i8* [[LT_CAST]])
+; CHECK-NEXT:    [[LT_CAST1:%.*]] = bitcast i32* [[DOTLOC]] to i8*
+; CHECK-NEXT:    call void @llvm.lifetime.start.p0i8(i64 -1, i8* [[LT_CAST1]])
+; CHECK-NEXT:    call void @outlined_ir_func_0(i32* [[A]], i32* [[B]], i32* [[OUTPUT]], i32* [[SUB_LOC]], i32* [[DOTLOC]], i32 1)
+; CHECK-NEXT:    [[SUB_RELOAD:%.*]] = load i32, i32* [[SUB_LOC]], align 4
+; CHECK-NEXT:    [[DOTRELOAD:%.*]] = load i32, i32* [[DOTLOC]], align 4
+; CHECK-NEXT:    call void @llvm.lifetime.end.p0i8(i64 -1, i8* [[LT_CAST]])
+; CHECK-NEXT:    call void @llvm.lifetime.end.p0i8(i64 -1, i8* [[LT_CAST1]])
+; CHECK-NEXT:    call void @outlined_ir_func_1(i32 [[DOTRELOAD]], i32 [[SUB_RELOAD]], i32* [[RESULT]])
+; CHECK-NEXT:    ret void
+;
+entry:
+  %a = alloca i32, align 4
+  %b = alloca i32, align 4
+  %output = alloca i32, align 4
+  %result = alloca i32, align 4
+  store i32 2, i32* %a, align 4
+  store i32 3, i32* %b, align 4
+  %0 = load i32, i32* %a, align 4
+  %1 = load i32, i32* %b, align 4
+  %add = add i32 %0, %1
+  %sub = sub i32 %0, %1
+  store i32 %add, i32* %output, align 4
+  %2 = load i32, i32* %output, align 4
+  %mul = mul i32 %2, %sub
+  store i32 %mul, i32* %result, align 4
+  ret void
+}
+
+; CHECK: define internal void @outlined_ir_func_0(i32* [[ARG0:%.*]], i32* [[ARG1:%.*]], i32* [[ARG2:%.*]], i32* [[ARG3:%.*]], i32* [[ARG4:%.*]], i32 [[ARG5:%.*]]) #1 {
+; CHECK: _after_outline.exitStub:
+; CHECK-NEXT:    switch i32 [[ARG5]], label [[BLOCK:%.*]] [
+; CHECK-NEXT:      i32 0, label %[[BLOCK_0:.*]]
+; CHECK-NEXT:      i32 1, label %[[BLOCK_1:.*]]
+
+; CHECK: entry_to_outline:
+; CHECK-NEXT:    store i32 2, i32* [[ARG0]], align 4
+; CHECK-NEXT:    store i32 3, i32* [[ARG1]], align 4
+; CHECK-NEXT:    [[TMP0:%.*]] = load i32, i32* [[ARG0]], align 4
+; CHECK-NEXT:    [[TMP1:%.*]] = load i32, i32* [[ARG1]], align 4
+; CHECK-NEXT:    [[ADD:%.*]] = add i32 [[TMP0]], [[TMP1]]
+; CHECK-NEXT:    [[SUB:%.*]] = sub i32 [[TMP0]], [[TMP1]]
+; CHECK-NEXT:    store i32 [[ADD]], i32* [[ARG2]], align 4
+; CHECK-NEXT:    [[TMP2:%.*]] = load i32, i32* [[ARG2]], align 4
+
+; CHECK: [[BLOCK_0]]:
+; CHECK-NEXT:    store i32 [[ADD]], i32* [[ARG3]], align 4
+; CHECK-NEXT:    store i32 [[TMP2]], i32* [[ARG4]], align 4
+
+; CHECK: [[BLOCK_1]]:
+; CHECK-NEXT:    store i32 [[SUB]], i32* [[ARG3]], align 4
+; CHECK-NEXT:    store i32 [[TMP2]], i32* [[ARG4]], align 4
diff --git a/llvm/test/Transforms/IROutliner/outlining-remapped-outputs.ll b/llvm/test/Transforms/IROutliner/outlining-remapped-outputs.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/Transforms/IROutliner/outlining-remapped-outputs.ll
@@ -0,0 +1,74 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -S -iroutliner < %s | FileCheck %s
+
+; This test tests that inputs that are replaced with the output of an outlined
+; function is still recognized as the same value.
+
+define void @outline_outputs1() #0 {
+; CHECK-LABEL: @outline_outputs1(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[DOTLOC2:%.*]] = alloca i32, align 4
+; CHECK-NEXT:    [[ADD2_LOC:%.*]] = alloca i32, align 4
+; CHECK-NEXT:    [[DOTLOC:%.*]] = alloca i32, align 4
+; CHECK-NEXT:    [[ADD_LOC:%.*]] = alloca i32, align 4
+; CHECK-NEXT:    [[A:%.*]] = alloca i32, align 4
+; CHECK-NEXT:    [[B:%.*]] = alloca i32, align 4
+; CHECK-NEXT:    [[OUTPUT:%.*]] = alloca i32, align 4
+; CHECK-NEXT:    [[RESULT:%.*]] = alloca i32, align 4
+; CHECK-NEXT:    [[OUTPUT2:%.*]] = alloca i32, align 4
+; CHECK-NEXT:    [[RESULT2:%.*]] = alloca i32, align 4
+; CHECK-NEXT:    [[LT_CAST:%.*]] = bitcast i32* [[ADD_LOC]] to i8*
+; CHECK-NEXT:    call void @llvm.lifetime.start.p0i8(i64 -1, i8* [[LT_CAST]])
+; CHECK-NEXT:    [[LT_CAST1:%.*]] = bitcast i32* [[DOTLOC]] to i8*
+; CHECK-NEXT:    call void @llvm.lifetime.start.p0i8(i64 -1, i8* [[LT_CAST1]])
+; CHECK-NEXT:    call void @outlined_ir_func_0(i32 2, i32* [[A]], i32* [[B]], i32* [[OUTPUT]], i32* [[ADD_LOC]], i32* [[DOTLOC]], i32 0)
+; CHECK-NEXT:    [[ADD_RELOAD:%.*]] = load i32, i32* [[ADD_LOC]], align 4
+; CHECK-NEXT:    [[DOTRELOAD:%.*]] = load i32, i32* [[DOTLOC]], align 4
+; CHECK-NEXT:    call void @llvm.lifetime.end.p0i8(i64 -1, i8* [[LT_CAST]])
+; CHECK-NEXT:    call void @llvm.lifetime.end.p0i8(i64 -1, i8* [[LT_CAST1]])
+; CHECK-NEXT:    [[TMP0:%.*]] = load i32, i32* [[OUTPUT]], align 4
+; CHECK-NEXT:    call void @outlined_ir_func_1(i32 [[DOTRELOAD]], i32 [[ADD_RELOAD]], i32* [[RESULT]])
+; CHECK-NEXT:    br label [[NEXT:%.*]]
+; CHECK:       next:
+; CHECK-NEXT:    [[LT_CAST4:%.*]] = bitcast i32* [[ADD2_LOC]] to i8*
+; CHECK-NEXT:    call void @llvm.lifetime.start.p0i8(i64 -1, i8* [[LT_CAST4]])
+; CHECK-NEXT:    [[LT_CAST5:%.*]] = bitcast i32* [[DOTLOC2]] to i8*
+; CHECK-NEXT:    call void @llvm.lifetime.start.p0i8(i64 -1, i8* [[LT_CAST5]])
+; CHECK-NEXT:    call void @outlined_ir_func_0(i32 [[ADD_RELOAD]], i32* [[OUTPUT]], i32* [[RESULT]], i32* [[OUTPUT2]], i32* [[ADD2_LOC]], i32* [[DOTLOC2]], i32 1)
+; CHECK-NEXT:    [[ADD2_RELOAD:%.*]] = load i32, i32* [[ADD2_LOC]], align 4
+; CHECK-NEXT:    [[DOTRELOAD3:%.*]] = load i32, i32* [[DOTLOC2]], align 4
+; CHECK-NEXT:    call void @llvm.lifetime.end.p0i8(i64 -1, i8* [[LT_CAST4]])
+; CHECK-NEXT:    call void @llvm.lifetime.end.p0i8(i64 -1, i8* [[LT_CAST5]])
+; CHECK-NEXT:    call void @outlined_ir_func_1(i32 [[DOTRELOAD3]], i32 [[ADD2_RELOAD]], i32* [[RESULT2]])
+; CHECK-NEXT:    ret void
+;
+entry:
+  %a = alloca i32, align 4
+  %b = alloca i32, align 4
+  %output = alloca i32, align 4
+  %result = alloca i32, align 4
+  %output2 = alloca i32, align 4
+  %result2 = alloca i32, align 4
+  store i32 2, i32* %a, align 4
+  store i32 3, i32* %b, align 4
+  %0 = load i32, i32* %a, align 4
+  %1 = load i32, i32* %b, align 4
+  %add = add i32 %0, %1
+  store i32 %add, i32* %output, align 4
+  %2 = load i32, i32* %output, align 4
+  %3 = load i32, i32* %output, align 4
+  %mul = mul i32 %2, %add
+  store i32 %mul, i32* %result, align 4
+  br label %next
+next:
+  store i32 %add, i32* %output, align 4
+  store i32 3, i32* %result, align 4
+  %4 = load i32, i32* %output, align 4
+  %5 = load i32, i32* %result, align 4
+  %add2 = add i32 %4, %5
+  store i32 %add2, i32* %output2, align 4
+  %6 = load i32, i32* %output2, align 4
+  %mul2 = mul i32 %6, %add2
+  store i32 %mul2, i32* %result2, align 4
+  ret void
+}
diff --git a/llvm/test/Transforms/IROutliner/outlining-same-output-blocks.ll b/llvm/test/Transforms/IROutliner/outlining-same-output-blocks.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/Transforms/IROutliner/outlining-same-output-blocks.ll
@@ -0,0 +1,107 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -S -iroutliner < %s | FileCheck %s
+
+; These functions are constructed slightly differently so that they require
+; the same output blocks for the values used outside of the region. We are
+; checking that two output blocks are created with the same store instructions.
+
+define void @outline_outputs1() #0 {
+; CHECK-LABEL: @outline_outputs1(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[DOTLOC:%.*]] = alloca i32, align 4
+; CHECK-NEXT:    [[ADD_LOC:%.*]] = alloca i32, align 4
+; CHECK-NEXT:    [[A:%.*]] = alloca i32, align 4
+; CHECK-NEXT:    [[B:%.*]] = alloca i32, align 4
+; CHECK-NEXT:    [[OUTPUT:%.*]] = alloca i32, align 4
+; CHECK-NEXT:    [[RESULT:%.*]] = alloca i32, align 4
+; CHECK-NEXT:    [[LT_CAST:%.*]] = bitcast i32* [[ADD_LOC]] to i8*
+; CHECK-NEXT:    call void @llvm.lifetime.start.p0i8(i64 -1, i8* [[LT_CAST]])
+; CHECK-NEXT:    [[LT_CAST1:%.*]] = bitcast i32* [[DOTLOC]] to i8*
+; CHECK-NEXT:    call void @llvm.lifetime.start.p0i8(i64 -1, i8* [[LT_CAST1]])
+; CHECK-NEXT:    call void @outlined_ir_func_0(i32* [[A]], i32* [[B]], i32* [[OUTPUT]], i32* [[ADD_LOC]], i32* [[DOTLOC]], i32 0)
+; CHECK-NEXT:    [[ADD_RELOAD:%.*]] = load i32, i32* [[ADD_LOC]], align 4
+; CHECK-NEXT:    [[DOTRELOAD:%.*]] = load i32, i32* [[DOTLOC]], align 4
+; CHECK-NEXT:    call void @llvm.lifetime.end.p0i8(i64 -1, i8* [[LT_CAST]])
+; CHECK-NEXT:    call void @llvm.lifetime.end.p0i8(i64 -1, i8* [[LT_CAST1]])
+; CHECK-NEXT:    [[TMP0:%.*]] = load i32, i32* [[OUTPUT]], align 4
+; CHECK-NEXT:    call void @outlined_ir_func_1(i32 [[DOTRELOAD]], i32 [[ADD_RELOAD]], i32* [[RESULT]])
+; CHECK-NEXT:    ret void
+;
+entry:
+  %a = alloca i32, align 4
+  %b = alloca i32, align 4
+  %output = alloca i32, align 4
+  %result = alloca i32, align 4
+  store i32 2, i32* %a, align 4
+  store i32 3, i32* %b, align 4
+  %0 = load i32, i32* %a, align 4
+  %1 = load i32, i32* %b, align 4
+  %add = add i32 %0, %1
+  store i32 %add, i32* %output, align 4
+  %2 = load i32, i32* %output, align 4
+  %3 = load i32, i32* %output, align 4
+  %mul = mul i32 %2, %add
+  store i32 %mul, i32* %result, align 4
+  ret void
+}
+
+define void @outline_outputs2() #0 {
+; CHECK-LABEL: @outline_outputs2(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[DOTLOC:%.*]] = alloca i32, align 4
+; CHECK-NEXT:    [[ADD_LOC:%.*]] = alloca i32, align 4
+; CHECK-NEXT:    [[A:%.*]] = alloca i32, align 4
+; CHECK-NEXT:    [[B:%.*]] = alloca i32, align 4
+; CHECK-NEXT:    [[OUTPUT:%.*]] = alloca i32, align 4
+; CHECK-NEXT:    [[RESULT:%.*]] = alloca i32, align 4
+; CHECK-NEXT:    [[LT_CAST:%.*]] = bitcast i32* [[ADD_LOC]] to i8*
+; CHECK-NEXT:    call void @llvm.lifetime.start.p0i8(i64 -1, i8* [[LT_CAST]])
+; CHECK-NEXT:    [[LT_CAST1:%.*]] = bitcast i32* [[DOTLOC]] to i8*
+; CHECK-NEXT:    call void @llvm.lifetime.start.p0i8(i64 -1, i8* [[LT_CAST1]])
+; CHECK-NEXT:    call void @outlined_ir_func_0(i32* [[A]], i32* [[B]], i32* [[OUTPUT]], i32* [[ADD_LOC]], i32* [[DOTLOC]], i32 1)
+; CHECK-NEXT:    [[ADD_RELOAD:%.*]] = load i32, i32* [[ADD_LOC]], align 4
+; CHECK-NEXT:    [[DOTRELOAD:%.*]] = load i32, i32* [[DOTLOC]], align 4
+; CHECK-NEXT:    call void @llvm.lifetime.end.p0i8(i64 -1, i8* [[LT_CAST]])
+; CHECK-NEXT:    call void @llvm.lifetime.end.p0i8(i64 -1, i8* [[LT_CAST1]])
+; CHECK-NEXT:    call void @outlined_ir_func_1(i32 [[DOTRELOAD]], i32 [[ADD_RELOAD]], i32* [[RESULT]])
+; CHECK-NEXT:    ret void
+;
+entry:
+  %a = alloca i32, align 4
+  %b = alloca i32, align 4
+  %output = alloca i32, align 4
+  %result = alloca i32, align 4
+  store i32 2, i32* %a, align 4
+  store i32 3, i32* %b, align 4
+  %0 = load i32, i32* %a, align 4
+  %1 = load i32, i32* %b, align 4
+  %add = add i32 %0, %1
+  store i32 %add, i32* %output, align 4
+  %2 = load i32, i32* %output, align 4
+  %mul = mul i32 %2, %add
+  store i32 %mul, i32* %result, align 4
+  ret void
+}
+
+; CHECK: define internal void @outlined_ir_func_0(i32* [[ARG0:%.*]], i32* [[ARG1:%.*]], i32* [[ARG2:%.*]], i32* [[ARG3:%.*]], i32* [[ARG4:%.*]], i32 [[ARG5:%.*]]) #1 {
+; CHECK: entry_after_outline.exitStub:
+; CHECK-NEXT:    switch i32 [[ARG5]], label [[BLOCK:%.*]] [
+; CHECK-NEXT:      i32 0, label %[[BLOCK_0:.*]]
+; CHECK-NEXT:      i32 1, label %[[BLOCK_1:.*]]
+
+; CHECK: entry_to_outline:
+; CHECK-NEXT:    store i32 2, i32* [[ARG0]], align 4
+; CHECK-NEXT:    store i32 3, i32* [[ARG1]], align 4
+; CHECK-NEXT:    [[TMP0:%.*]] = load i32, i32* [[ARG0]], align 4
+; CHECK-NEXT:    [[TMP1:%.*]] = load i32, i32* [[ARG1]], align 4
+; CHECK-NEXT:    [[ADD:%.*]] = add i32 [[TMP0]], [[TMP1]]
+; CHECK-NEXT:    store i32 [[ADD]], i32* [[ARG2]], align 4
+; CHECK-NEXT:    [[TMP2:%.*]] = load i32, i32* [[ARG2]], align 4
+
+; CHECK: [[BLOCK_0]]:
+; CHECK-NEXT:    store i32 [[ADD]], i32* [[ARG3]], align 4
+; CHECK-NEXT:    store i32 [[TMP2]], i32* [[ARG4]], align 4
+
+; CHECK: [[BLOCK_1]]:
+; CHECK-NEXT:    store i32 [[ADD]], i32* [[ARG3]], align 4
+; CHECK-NEXT:    store i32 [[TMP2]], i32* [[ARG4]], align 4