diff --git a/llvm/include/llvm/InitializePasses.h b/llvm/include/llvm/InitializePasses.h
--- a/llvm/include/llvm/InitializePasses.h
+++ b/llvm/include/llvm/InitializePasses.h
@@ -334,6 +334,7 @@
 void initializeOptimizationRemarkEmitterWrapperPassPass(PassRegistry&);
 void initializeOptimizePHIsPass(PassRegistry&);
 void initializePAEvalPass(PassRegistry&);
+void initializePartialMemToRegLegacyPassPass(PassRegistry &);
 void initializePEIPass(PassRegistry&);
 void initializePGOIndirectCallPromotionLegacyPassPass(PassRegistry&);
 void initializePGOInstrumentationGenLegacyPassPass(PassRegistry&);
diff --git a/llvm/include/llvm/Transforms/Scalar.h b/llvm/include/llvm/Transforms/Scalar.h
--- a/llvm/include/llvm/Transforms/Scalar.h
+++ b/llvm/include/llvm/Transforms/Scalar.h
@@ -113,6 +113,13 @@
 //
 FunctionPass *createSROAPass();
 
+//===----------------------------------------------------------------------===//
+//
+// PartialMemToReg - Converts alloca uses into phi nodes until the address
+// is (potentially) captured.
+//
+FunctionPass *createPartialMemToRegPass();
+
 //===----------------------------------------------------------------------===//
 //
 // InductiveRangeCheckElimination - Transform loops to elide range checks on
diff --git a/llvm/include/llvm/Transforms/Scalar/PartialMemToReg.h b/llvm/include/llvm/Transforms/Scalar/PartialMemToReg.h
new file mode 100644
--- /dev/null
+++ b/llvm/include/llvm/Transforms/Scalar/PartialMemToReg.h
@@ -0,0 +1,53 @@
+//===- PartialMemToReg.h ---------------------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+/// \file
+/// Provides a pass which runs a partial mem2reg operation on allocas which
+/// are deemed to be captured at some point but are used extensively
+/// beforehand.
+///
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_TRANSFORMS_SCALAR_PARTIALMEMTOREG_H
+#define LLVM_TRANSFORMS_SCALAR_PARTIALMEMTOREG_H
+
+#include "llvm/ADT/SetVector.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/IR/PassManager.h"
+#include "llvm/IR/ValueHandle.h"
+
+namespace llvm {
+
+class AssumptionCache;
+class DominatorTree;
+class Function;
+class LoopInfo;
+class PostDominatorTree;
+
+class PartialMemToRegLegacyPass;
+
+class PartialMemToReg : public PassInfoMixin<PartialMemToReg> {
+public:
+  PartialMemToReg() = default;
+
+  /// Run the pass over the function.
+  PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM);
+
+private:
+  friend class PartialMemToRegLegacyPass;
+
+  /// Helper used by both the public run method and by the legacy pass.
+  PreservedAnalyses runImpl(Function &F, DominatorTree &DT,
+                            PostDominatorTree &PDT, AssumptionCache &AC,
+                            LoopInfo &LI);
+};
+
+} // end namespace llvm
+
+#endif // LLVM_TRANSFORMS_SCALAR_PARTIALMEMTOREG_H
diff --git a/llvm/include/llvm/Transforms/Utils/PromoteMemToReg.h b/llvm/include/llvm/Transforms/Utils/PromoteMemToReg.h
--- a/llvm/include/llvm/Transforms/Utils/PromoteMemToReg.h
+++ b/llvm/include/llvm/Transforms/Utils/PromoteMemToReg.h
@@ -18,7 +18,9 @@
 template <typename T> class ArrayRef;
 class AllocaInst;
+class LoopInfo;
 class DominatorTree;
+class PostDominatorTree;
 class AssumptionCache;
 
 /// Return true if this alloca is legal for promotion.
@@ -27,7 +29,7 @@
 /// (transitively) using this alloca. This also enforces that there is only
 /// ever one layer of bitcasts or GEPs between the alloca and the lifetime
 /// markers.
-bool isAllocaPromotable(const AllocaInst *AI);
+bool isAllocaPromotable(const AllocaInst *AI, bool AllowCaptures = false);
 
 /// Promote the specified list of alloca instructions into scalar
 /// registers, inserting PHI nodes as appropriate.
@@ -39,6 +41,9 @@
 void PromoteMemToReg(ArrayRef<AllocaInst *> Allocas, DominatorTree &DT,
                      AssumptionCache *AC = nullptr);
 
+bool partialPromoteMemToReg(ArrayRef<AllocaInst *> Allocas, LoopInfo &LI,
+                            DominatorTree &DT, PostDominatorTree &PDT,
+                            AssumptionCache *AC = nullptr);
+
 } // End llvm namespace
 
 #endif
diff --git a/llvm/lib/Passes/PassBuilder.cpp b/llvm/lib/Passes/PassBuilder.cpp
--- a/llvm/lib/Passes/PassBuilder.cpp
+++ b/llvm/lib/Passes/PassBuilder.cpp
@@ -193,6 +193,7 @@
 #include "llvm/Transforms/Scalar/MergedLoadStoreMotion.h"
 #include "llvm/Transforms/Scalar/NaryReassociate.h"
 #include "llvm/Transforms/Scalar/NewGVN.h"
+#include "llvm/Transforms/Scalar/PartialMemToReg.h"
 #include "llvm/Transforms/Scalar/PartiallyInlineLibCalls.h"
 #include "llvm/Transforms/Scalar/Reassociate.h"
 #include "llvm/Transforms/Scalar/Reg2Mem.h"
@@ -279,6 +280,10 @@
     "enable-npm-O3-nontrivial-unswitch", cl::init(true), cl::Hidden,
     cl::ZeroOrMore, cl::desc("Enable non-trivial loop unswitching for -O3"));
 
+static cl::opt<bool> EnablePartialMemToReg(
+    "enable-partial-mem2reg", cl::init(false), cl::Hidden, cl::ZeroOrMore,
+    cl::desc("Enable partial mem2reg SSA transformation before captures."));
+
 PipelineTuningOptions::PipelineTuningOptions() {
   LoopInterleaving = true;
   LoopVectorization = true;
@@ -816,6 +821,10 @@
   // Delete small array after loop unroll.
   FPM.addPass(SROA());
 
+  // Partially promote some captured allocas to SSA form.
+  if (EnablePartialMemToReg)
+    FPM.addPass(PartialMemToReg());
+
   // Eliminate redundancies.
   FPM.addPass(MergedLoadStoreMotionPass());
   if (RunNewGVN)
diff --git a/llvm/lib/Passes/PassRegistry.def b/llvm/lib/Passes/PassRegistry.def
--- a/llvm/lib/Passes/PassRegistry.def
+++ b/llvm/lib/Passes/PassRegistry.def
@@ -279,6 +279,7 @@
 FUNCTION_PASS("objc-arc", ObjCARCOptPass())
 FUNCTION_PASS("objc-arc-contract", ObjCARCContractPass())
 FUNCTION_PASS("objc-arc-expand", ObjCARCExpandPass())
+FUNCTION_PASS("partial-mem2reg", PartialMemToReg())
 FUNCTION_PASS("pgo-memop-opt", PGOMemOPSizeOpt())
 FUNCTION_PASS("print", PrintFunctionPass(dbgs()))
 FUNCTION_PASS("print<assumptions>", AssumptionPrinterPass(dbgs()))
diff --git a/llvm/lib/Transforms/Scalar/CMakeLists.txt b/llvm/lib/Transforms/Scalar/CMakeLists.txt
--- a/llvm/lib/Transforms/Scalar/CMakeLists.txt
+++ b/llvm/lib/Transforms/Scalar/CMakeLists.txt
@@ -60,6 +60,7 @@
   NaryReassociate.cpp
   NewGVN.cpp
   PartiallyInlineLibCalls.cpp
+  PartialMemToReg.cpp
   PlaceSafepoints.cpp
   Reassociate.cpp
   Reg2Mem.cpp
diff --git a/llvm/lib/Transforms/Scalar/PartialMemToReg.cpp b/llvm/lib/Transforms/Scalar/PartialMemToReg.cpp
new file mode 100644
--- /dev/null
+++ b/llvm/lib/Transforms/Scalar/PartialMemToReg.cpp
@@ -0,0 +1,130 @@
+//===- PartialMemToReg.cpp -------------------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+/// \file
+/// Provides a pass which runs a partial mem2reg operation on allocas which
+/// are deemed to be captured at some point but are used extensively
+/// beforehand.
+///
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Scalar/PartialMemToReg.h"
+#include "llvm/ADT/SetVector.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/Analysis/AssumptionCache.h"
+#include "llvm/Analysis/GlobalsModRef.h"
+#include "llvm/Analysis/LoopInfo.h"
+#include "llvm/Analysis/PostDominators.h"
+#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/IR/InstrTypes.h"
+#include "llvm/IR/Instruction.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/Intrinsics.h"
+#include "llvm/IR/PassManager.h"
+#include "llvm/IR/Type.h"
+#include "llvm/IR/Use.h"
+#include "llvm/IR/User.h"
+#include "llvm/IR/Value.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Compiler.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/MathExtras.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/Transforms/Utils/PromoteMemToReg.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "partial-mem2reg"
+
+namespace llvm {
+
+PreservedAnalyses PartialMemToReg::run(Function &F,
+                                       FunctionAnalysisManager &AM) {
+  auto &DT = AM.getResult<DominatorTreeAnalysis>(F);
+  auto &PDT = AM.getResult<PostDominatorTreeAnalysis>(F);
+  auto &AC = AM.getResult<AssumptionAnalysis>(F);
+  auto &LI = AM.getResult<LoopAnalysis>(F);
+
+  return runImpl(F, DT, PDT, AC, LI);
+}
+
+PreservedAnalyses PartialMemToReg::runImpl(Function &F, DominatorTree &DT,
+                                           PostDominatorTree &PDT,
+                                           AssumptionCache &AC, LoopInfo &LI) {
+  LLVM_DEBUG(dbgs() << "PartialMem2Reg on: " << F.getName() << "\n");
+
+  // Collect candidate allocas from the entry block.
+  SmallVector<AllocaInst *> Worklist;
+  BasicBlock &EntryBB = F.getEntryBlock();
+  for (Instruction &I : EntryBB)
+    if (AllocaInst *AI = dyn_cast<AllocaInst>(&I))
+      if (!isa(AI->getAllocatedType()))
+        Worklist.push_back(AI);
+
+  if (!partialPromoteMemToReg(Worklist, LI, DT, PDT, &AC))
+    return PreservedAnalyses::all();
+
+  PreservedAnalyses PA;
+  PA.preserveSet<CFGAnalyses>();
+  PA.preserve<GlobalsAA>();
+  return PA;
+}
+
+class PartialMemToRegLegacyPass : public FunctionPass {
+  PartialMemToReg Impl;
+
+public:
+  static char ID;
+
+  PartialMemToRegLegacyPass() : FunctionPass(ID) {
+    initializePartialMemToRegLegacyPassPass(*PassRegistry::getPassRegistry());
+  }
+
+  bool runOnFunction(Function &F) override {
+    if (skipFunction(F))
+      return false;
+
+    auto PA = Impl.runImpl(
+        F, getAnalysis<DominatorTreeWrapperPass>().getDomTree(),
+        getAnalysis<PostDominatorTreeWrapperPass>().getPostDomTree(),
+        getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F),
+        getAnalysis<LoopInfoWrapperPass>().getLoopInfo());
+
+    return !PA.areAllPreserved();
+  }
+
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
+    AU.addRequired<AssumptionCacheTracker>();
+    AU.addRequired<DominatorTreeWrapperPass>();
+    AU.addRequired<PostDominatorTreeWrapperPass>();
+    AU.addRequired<LoopInfoWrapperPass>();
+    AU.addPreserved<GlobalsAAWrapperPass>();
+    AU.setPreservesCFG();
+  }
+
+  StringRef getPassName() const override { return "PartialMemToReg"; }
+};
+} // end namespace llvm
+
+char llvm::PartialMemToRegLegacyPass::ID = 0;
+
+FunctionPass *llvm::createPartialMemToRegPass() {
+  return new PartialMemToRegLegacyPass();
+}
+
+INITIALIZE_PASS_BEGIN(PartialMemToRegLegacyPass, "partial-mem2reg",
+                      "PartialMemToReg", false, false)
+INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
+INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(PostDominatorTreeWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
+INITIALIZE_PASS_END(PartialMemToRegLegacyPass, "partial-mem2reg",
+                    "PartialMemToReg", false, false)
diff --git a/llvm/lib/Transforms/Scalar/Scalar.cpp b/llvm/lib/Transforms/Scalar/Scalar.cpp
--- a/llvm/lib/Transforms/Scalar/Scalar.cpp
+++ b/llvm/lib/Transforms/Scalar/Scalar.cpp
@@ -92,6 +92,7 @@
   initializeMergedLoadStoreMotionLegacyPassPass(Registry);
   initializeNaryReassociateLegacyPassPass(Registry);
   initializePartiallyInlineLibCallsLegacyPassPass(Registry);
+  initializePartialMemToRegLegacyPassPass(Registry);
   initializeReassociateLegacyPassPass(Registry);
   initializeRedundantDbgInstEliminationPass(Registry);
   initializeRegToMemLegacyPass(Registry);
diff --git a/llvm/lib/Transforms/Utils/PromoteMemoryToRegister.cpp b/llvm/lib/Transforms/Utils/PromoteMemoryToRegister.cpp
--- a/llvm/lib/Transforms/Utils/PromoteMemoryToRegister.cpp
+++ b/llvm/lib/Transforms/Utils/PromoteMemoryToRegister.cpp
@@ -25,7 +25,8 @@
 #include "llvm/Analysis/AssumptionCache.h"
 #include "llvm/Analysis/InstructionSimplify.h"
 #include "llvm/Analysis/IteratedDominanceFrontier.h"
-#include "llvm/Transforms/Utils/Local.h"
+#include "llvm/Analysis/LoopInfo.h"
+#include "llvm/Analysis/PostDominators.h"
 #include "llvm/Analysis/ValueTracking.h"
 #include "llvm/IR/BasicBlock.h"
 #include "llvm/IR/CFG.h"
@@ -45,6 +46,7 @@
 #include "llvm/IR/Type.h"
 #include "llvm/IR/User.h"
 #include "llvm/Support/Casting.h"
+#include "llvm/Transforms/Utils/Local.h"
 #include "llvm/Transforms/Utils/PromoteMemToReg.h"
 #include <algorithm>
 #include <cassert>
@@ -61,7 +63,7 @@
 STATISTIC(NumDeadAlloca, "Number of dead alloca's removed");
 STATISTIC(NumPHIInsert, "Number of PHI nodes inserted");
 
-bool llvm::isAllocaPromotable(const AllocaInst *AI) {
+bool llvm::isAllocaPromotable(const AllocaInst *AI, bool AllowCaptures) {
   // Only allow direct and non-volatile loads and stores...
   for (const User *U : AI->users()) {
     if (const LoadInst *LI = dyn_cast<LoadInst>(U)) {
@@ -70,9 +72,12 @@
       if (LI->isVolatile())
        return false;
     } else if (const StoreInst *SI = dyn_cast<StoreInst>(U)) {
-      if (SI->getValueOperand() == AI ||
-          SI->getValueOperand()->getType() != AI->getAllocatedType())
-        return false; // Don't allow a store OF the AI, only INTO the AI.
+      // Don't allow a store OF the AI, only INTO the AI, unless we're
+      // looking for captures during partial mem2reg.
+      if ((SI->getValueOperand() == AI ||
+           SI->getValueOperand()->getType() != AI->getAllocatedType()) &&
+          !AllowCaptures)
+        return false;
       // Note that atomic stores can be transformed; atomic semantics do
       // not have any meaning for a local alloca.
       if (SI->isVolatile())
@@ -124,7 +129,9 @@
 
   /// Scan the uses of the specified alloca, filling in the AllocaInfo used
   /// by the rest of the pass to reason about the uses of this alloca.
-  void AnalyzeAlloca(AllocaInst *AI) {
+  void AnalyzeAlloca(AllocaInst *AI, bool Capturing = false,
+                     PostDominatorTree *PDT = nullptr,
+                     StoreInst *Capture = nullptr) {
     clear();
 
     // As we scan the uses of the alloca instruction, keep track of stores,
@@ -133,16 +140,28 @@
     for (User *U : AI->users()) {
       Instruction *User = cast<Instruction>(U);
 
+      // If we're analyzing a captured alloca, only consider the users that
+      // are postdominated by the capture. Anything after the capture or
+      // in a block which may bypass the capture should not be converted
+      // to SSA form.
+      if (Capturing && !PDT->dominates(Capture, User))
+        continue;
+
       if (StoreInst *SI = dyn_cast<StoreInst>(User)) {
         // Remember the basic blocks which define new values for the alloca
-        DefiningBlocks.push_back(SI->getParent());
-        OnlyStore = SI;
-      } else {
-        LoadInst *LI = cast<LoadInst>(User);
+        if (SI->getOperand(0) == AI) {
+          assert(Capturing && "Unexpected capture for non-captured alloca.");
+          UsingBlocks.push_back(SI->getParent());
+        } else {
+          DefiningBlocks.push_back(SI->getParent());
+          OnlyStore = SI;
+        }
+      } else if (LoadInst *LI = dyn_cast<LoadInst>(User)) {
         // Otherwise it must be a load instruction, keep track of variable
         // reads.
         UsingBlocks.push_back(LI->getParent());
-      }
+      } else
+        assert(Capturing && "Unexpected user for non-captured alloca.");
 
       if (OnlyUsedInOneBlock) {
         if (!OnlyBlock)
@@ -243,6 +262,10 @@
   /// (BBNumbers), the DenseMap is more efficient (also supports removal).
   DenseMap<std::pair<unsigned, unsigned>, PHINode *> NewPhiNodes;
 
+  /// Stores inserted to flush the promoted value back to memory just before
+  /// the capturing store of each partially promoted alloca.
+  DenseMap<unsigned, StoreInst *> NewPreCaptureStores;
+
+  /// The single capturing store identified for each partially promoted
+  /// alloca, indexed by its position in Allocas.
+  DenseMap<unsigned, StoreInst *> Captures;
+
   /// For each PHI node, keep track of which entry in Allocas it corresponds
   /// to.
   DenseMap<PHINode *, unsigned> PhiToAllocaMap;
@@ -271,6 +294,7 @@
             nullptr, &DT, AC) {}
 
   void run();
+  bool runPartial(PostDominatorTree &PDT, LoopInfo &LI);
 
 private:
   void RemoveFromAllocasList(unsigned &AllocaIdx) {
@@ -294,6 +318,7 @@
                   RenamePassData::LocationVector &IncLocs,
                   std::vector<RenamePassData> &Worklist);
   bool QueuePhiNode(BasicBlock *BB, unsigned AllocaIdx, unsigned &Version);
+  bool QueuePreCaptureStore(BasicBlock *BB, unsigned AllocaIdx);
 };
 
 } // end anonymous namespace
 
@@ -543,6 +568,311 @@
   return true;
 }
 
+/// Partially promote the allocas in Allocas: rewrite the loads and stores
+/// that are postdominated by the single capturing store into SSA form, and
+/// materialise the final value with a store immediately before the capture.
+bool PromoteMem2Reg::runPartial(PostDominatorTree &PDT, LoopInfo &LI) {
+  Function &F = *DT.getRoot()->getParent();
+  AllocaDbgUsers.resize(Allocas.size());
+  AllocaInfo Info;
+  LargeBlockInfo LBI;
+  ForwardIDFCalculator IDF(DT);
+
+  // Create a stable numbering for basic blocks to avoid any non-deterministic
+  // behaviour with ordering.
+  if (BBNumbers.empty()) {
+    unsigned ID = 0;
+    for (auto &BB : F)
+      BBNumbers[&BB] = ID++;
+  }
+
+  for (unsigned AllocaNum = 0; AllocaNum != Allocas.size(); ++AllocaNum) {
+    AllocaInst *AI = Allocas[AllocaNum];
+    StoreInst *Capture = nullptr;
+    LLVM_DEBUG(dbgs() << "PM2R: Analyzing alloca: " << *AI << "\n");
+
+    if (!isAllocaPromotable(AI, /*AllowCaptures=*/true)) {
+      LLVM_DEBUG(dbgs() << "PM2R: Unhandled uses.\n");
+      RemoveFromAllocasList(AllocaNum);
+      continue;
+    }
+
+    // For now, be a bit paranoid and only consider allocas with a single
+    // capture user, and only stores as a capture.
+    // TODO: Allow for multiple captures and captures in call instructions.
+    bool MultiCapture = false;
+    for (User *U : AI->users()) {
+      if (StoreInst *SI = dyn_cast<StoreInst>(U))
+        if (SI->getValueOperand() == AI) {
+          if (!Capture)
+            Capture = SI;
+          else
+            MultiCapture = true;
+        }
+    }
+
+    if (!Capture || MultiCapture) {
+      LLVM_DEBUG(dbgs() << "PM2R: No capture or multiple captures.\n");
+      RemoveFromAllocasList(AllocaNum);
+      continue;
+    }
+
+    Captures[AllocaNum] = Capture;
+
+    // Find the set of users that are postdominated by the capture.
+    SmallVector<Instruction *> PreCaptureUsers;
+    for (User *U : AI->users()) {
+      Instruction *I = cast<Instruction>(U);
+      // Ignore the capture itself.
+      if (Capture == I)
+        continue;
+
+      // We only consider users that are postdominated by the capture -- that
+      // is, we _know_ that the capture will definitely be executed after the
+      // user. For users that are executed after the capture, or users where
+      // the subsequent execution path might not go through the block
+      // containing the capture, we don't want to convert right now.
+      if (PDT.dominates(Capture, I))
+        PreCaptureUsers.push_back(I);
+    }
+
+    // If there are no users postdominated by the capture, we won't try this
+    // since anything after the capture could be reached without storing
+    // the value into the alloca location.
+    // TODO: We should be able to identify all blocks which need to store
+    // the value into memory before entering code that may follow a capture.
+    if (PreCaptureUsers.empty()) {
+      LLVM_DEBUG(dbgs() << "PM2R: No users postdominated by capture.\n");
+      RemoveFromAllocasList(AllocaNum);
+      continue;
+    }
+
+    // For now, only perform the partial conversion if some of the uses are
+    // present in a loop -- while it may be worthwhile to do this anyway,
+    // we're currently interested in enabling loop transformations that would
+    // otherwise be prevented by the presence of loads/stores to the alloca
+    // within the loop.
+    bool UsedInLoop = false;
+    for (Loop *L : LI)
+      UsedInLoop |= any_of(PreCaptureUsers,
+                           [&L](Instruction *I) { return L->contains(I); });
+    if (!UsedInLoop) {
+      LLVM_DEBUG(dbgs() << "PM2R: No users in loops.\n");
+      RemoveFromAllocasList(AllocaNum);
+      continue;
+    }
+
+    // Determine which blocks define, use, and/or capture the alloca.
+    Info.AnalyzeAlloca(AI, /*Capturing=*/true, &PDT, Capture);
+
+    // Unique the set of defining blocks for efficient lookup.
+    SmallPtrSet<BasicBlock *, 32> DefBlocks(Info.DefiningBlocks.begin(),
+                                            Info.DefiningBlocks.end());
+
+    // Determine which blocks the value is live in. These are blocks which
+    // lead to uses.
+    SmallPtrSet<BasicBlock *, 32> LiveInBlocks;
+    ComputeLiveInBlocks(AI, Info, DefBlocks, LiveInBlocks);
+
+    if (!all_of(LiveInBlocks, [&PDT, &Capture](BasicBlock *BB) {
+          return PDT.dominates(Capture->getParent(), BB);
+        })) {
+      LLVM_DEBUG(
+          dbgs()
+          << "PM2R: not all live blocks are postdominated by capture.\n");
+      RemoveFromAllocasList(AllocaNum);
+      continue;
+    }
+
+    // Ok, we've passed all our criteria for partially promoting an alloca.
+    // Proceed with figuring out what to do with it but throw in a few extra
+    // checks out of an abundance of caution.
+
+    // Calculate dominance frontiers so we know where to plant phi nodes for
+    // SSA conversion.
+    IDF.setLiveInBlocks(LiveInBlocks);
+    IDF.setDefiningBlocks(DefBlocks);
+    SmallVector<BasicBlock *, 32> PHIBlocks;
+    IDF.calculate(PHIBlocks);
+    llvm::sort(PHIBlocks, [this](BasicBlock *A, BasicBlock *B) {
+      return BBNumbers.find(A)->second < BBNumbers.find(B)->second;
+    });
+
+    if (PHIBlocks.empty()) {
+      LLVM_DEBUG(dbgs() << "PM2R: could not identify a usable phi block\n");
+      continue;
+    }
+
+    // Only proceed if all phi blocks are postdominated by the capture.
+    // Maybe this should be an assert?
+    if (!all_of(PHIBlocks, [&PDT, &Capture](BasicBlock *BB) {
+          return PDT.dominates(Capture->getParent(), BB);
+        })) {
+      LLVM_DEBUG(
+          dbgs() << "PM2R: not all phi blocks are postdominated by capture.\n");
+      RemoveFromAllocasList(AllocaNum);
+      continue;
+    }
+
+    // We may want to find a better way of doing this in future, but for
+    // now just be paranoid and sort the blocks by postdomination order
+    // before adding the store to ensure the correct value is in place
+    // for the capture.
+    // TODO: Add support for phis on diverging paths that are still
+    // postdominated by the capture.
+    SmallVector<BasicBlock *, 32> PDOrderBlocks(PHIBlocks.begin(),
+                                                PHIBlocks.end());
+    llvm::sort(PDOrderBlocks, [&PDT](BasicBlock *A, BasicBlock *B) {
+      return PDT.dominates(B, A);
+    });
+    BasicBlock *DefBlock = PDOrderBlocks.back();
+
+    // Maybe this should be an assert?
+    if (!all_of(PDOrderBlocks, [&PDT, &DefBlock](BasicBlock *BB) {
+          return PDT.dominates(DefBlock, BB);
+        })) {
+      LLVM_DEBUG(dbgs() << "PM2R: definition block for capture doesn't "
+                           "postdominate all other phi blocks.\n");
+      RemoveFromAllocasList(AllocaNum);
+      continue;
+    }
+
+    // Point of no return; we're making changes to the IR now.
+
+    // Remember the dbg.declare intrinsic describing this alloca, if any.
+    if (!Info.DbgUsers.empty())
+      AllocaDbgUsers[AllocaNum] = Info.DbgUsers;
+
+    LLVM_DEBUG(dbgs() << "PM2R: Partially promoting alloca: " << *AI << "\n");
+
+    // Keep the reverse mapping of the 'Allocas' array for the rename pass.
+    AllocaLookup[Allocas[AllocaNum]] = AllocaNum;
+
+    unsigned CurrentVersion = 0;
+    for (BasicBlock *BB : PHIBlocks)
+      QueuePhiNode(BB, AllocaNum, CurrentVersion);
+
+    // Create the store that will set the allocated memory to the right value
+    // before the capture occurs.
+    QueuePreCaptureStore(DefBlock, AllocaNum);
+  }
+
+  if (Allocas.empty())
+    return false;
+
+  RenamePassData::ValVector Values(Allocas.size());
+  for (unsigned i = 0, e = Allocas.size(); i != e; ++i)
+    Values[i] = UndefValue::get(Allocas[i]->getAllocatedType());
+
+  // When handling debug info, treat all incoming values as if they have
+  // unknown locations until proven otherwise.
+  RenamePassData::LocationVector Locations(Allocas.size());
+
+  // Walks all basic blocks in the function performing the SSA rename
+  // algorithm and inserting the phi nodes we marked as necessary.
+  std::vector<RenamePassData> RenamePassWorkList;
+  RenamePassWorkList.emplace_back(&F.front(), nullptr, std::move(Values),
+                                  std::move(Locations));
+  do {
+    RenamePassData RPD = std::move(RenamePassWorkList.back());
+    RenamePassWorkList.pop_back();
+    // RenamePass may add new worklist entries.
+    RenamePass(RPD.BB, RPD.Pred, RPD.Values, RPD.Locations,
+               RenamePassWorkList);
+  } while (!RenamePassWorkList.empty());
+
+  // The renamer uses the Visited set to avoid infinite loops. Clear it now.
+  Visited.clear();
+
+  // Loop over all of the PHI nodes and see if there are any that we can get
+  // rid of because they merge all of the same incoming values. This can
+  // happen due to undef values coming into the PHI nodes. This process is
+  // iterative, because eliminating one PHI node can cause others to be
+  // removed.
+  bool EliminatedAPHI = true;
+  while (EliminatedAPHI) {
+    EliminatedAPHI = false;
+
+    // Iterating over NewPhiNodes is deterministic, so it is safe to try to
+    // simplify and RAUW them as we go. If it was not, we could add uses to
+    // the values we replace with in a non-deterministic order, thus creating
+    // non-deterministic def->use chains.
+    for (DenseMap<std::pair<unsigned, unsigned>, PHINode *>::iterator
+             I = NewPhiNodes.begin(),
+             E = NewPhiNodes.end();
+         I != E;) {
+      PHINode *PN = I->second;
+
+      // If this PHI node merges one value and/or undefs, get the value.
+      if (Value *V = SimplifyInstruction(PN, SQ)) {
+        PN->replaceAllUsesWith(V);
+        PN->eraseFromParent();
+        NewPhiNodes.erase(I++);
+        EliminatedAPHI = true;
+        continue;
+      }
+      ++I;
+    }
+  }
+
+  // At this point, the renamer has added entries to PHI nodes for all
+  // reachable code. Unfortunately, there may be unreachable blocks which the
+  // renamer hasn't traversed. If this is the case, the PHI nodes may not
+  // have incoming values for all predecessors. Loop over all PHI nodes we
+  // have created, inserting undef values if they are missing any incoming
+  // values.
+  for (DenseMap<std::pair<unsigned, unsigned>, PHINode *>::iterator
+           I = NewPhiNodes.begin(),
+           E = NewPhiNodes.end();
+       I != E; ++I) {
+    // We want to do this once per basic block. As such, only process a block
+    // when we find the PHI that is the first entry in the block.
+    PHINode *SomePHI = I->second;
+    BasicBlock *BB = SomePHI->getParent();
+    if (&BB->front() != SomePHI)
+      continue;
+
+    // Only do work here if the PHI nodes are missing incoming values. We
+    // know that all PHI nodes that were inserted in a block will have the
+    // same number of incoming values, so we can just check any of them.
+    if (SomePHI->getNumIncomingValues() == getNumPreds(BB))
+      continue;
+
+    // Get the preds for BB.
+    SmallVector<BasicBlock *, 16> Preds(predecessors(BB));
+
+    // Ok, now we know that all of the PHI nodes are missing entries for some
+    // basic blocks. Start by sorting the incoming predecessors for efficient
+    // access.
+    auto CompareBBNumbers = [this](BasicBlock *A, BasicBlock *B) {
+      return BBNumbers.find(A)->second < BBNumbers.find(B)->second;
+    };
+    llvm::sort(Preds, CompareBBNumbers);
+
+    // Now we loop through all BB's which have entries in SomePHI and remove
+    // them from the Preds list.
+    for (unsigned i = 0, e = SomePHI->getNumIncomingValues(); i != e; ++i) {
+      // Do a log(n) search of the Preds list for the entry we want.
+      SmallVectorImpl<BasicBlock *>::iterator EntIt = llvm::lower_bound(
+          Preds, SomePHI->getIncomingBlock(i), CompareBBNumbers);
+      assert(EntIt != Preds.end() && *EntIt == SomePHI->getIncomingBlock(i) &&
+             "PHI node has entry for a block which is not a predecessor!");
+
+      // Remove the entry
+      Preds.erase(EntIt);
+    }
+
+    // At this point, the blocks left in the preds list must have dummy
+    // entries inserted into every PHI node in the block. Update all the phi
+    // nodes in this block that we are inserting (there could be phis before
+    // mem2reg runs).
+    unsigned NumBadPreds = SomePHI->getNumIncomingValues();
+    BasicBlock::iterator BBI = BB->begin();
+    while ((SomePHI = dyn_cast<PHINode>(BBI++)) &&
+           SomePHI->getNumIncomingValues() == NumBadPreds) {
+      Value *UndefVal = UndefValue::get(SomePHI->getType());
+      for (BasicBlock *Pred : Preds)
+        SomePHI->addIncoming(UndefVal, Pred);
+    }
+  }
+
+  NewPhiNodes.clear();
+
+  return true;
+}
+
 void PromoteMem2Reg::run() {
   Function &F = *DT.getRoot()->getParent();
 
@@ -870,6 +1200,23 @@
   return true;
 }
 
+/// Adds a store after the last phi node before a capturing store, so that
+/// the value is up-to-date before the capture of the alloca.
+bool PromoteMem2Reg::QueuePreCaptureStore(BasicBlock *BB, unsigned AllocaIdx) {
+  StoreInst *&SI = NewPreCaptureStores[AllocaIdx];
+  PHINode *&PN = NewPhiNodes[std::make_pair(BBNumbers[BB], AllocaIdx)];
+  AllocaInst *AI = Allocas[AllocaIdx];
+
+  if (SI)
+    return false;
+
+  assert(PN && "No Phi node available for capture!");
+
+  Instruction *InsertBefore = BB->getFirstNonPHI();
+  SI = new StoreInst(PN, AI, InsertBefore);
+  return true;
+}
+
 /// Update the debug location of a phi. \p ApplyMergedLoc indicates whether to
 /// create a merged location incorporating \p DL, or to set \p DL directly.
 static void updateForIncomingValueLocation(PHINode *PN, DebugLoc DL,
@@ -954,7 +1301,14 @@
       if (AI == AllocaLookup.end())
         continue;
 
-      Value *V = IncomingVals[AI->second];
+      unsigned AllocaNo = AI->second;
+
+      // If this is a load after the capture (but potentially in the same
+      // block) then we must not convert it.
+      if (Captures[AllocaNo] && DT.dominates(Captures[AllocaNo], LI))
+        continue;
+
+      Value *V = IncomingVals[AllocaNo];
 
       // If the load was marked as nonnull we don't want to lose
       // that information when we erase this Load. So we preserve
@@ -973,17 +1327,23 @@
       if (!Dest)
         continue;
 
-      DenseMap<AllocaInst *, unsigned>::iterator ai = AllocaLookup.find(Dest);
-      if (ai == AllocaLookup.end())
+      DenseMap<AllocaInst *, unsigned>::iterator AI = AllocaLookup.find(Dest);
+      if (AI == AllocaLookup.end())
+        continue;
+
+      unsigned AllocaNo = AI->second;
+      StoreInst *&PCS = NewPreCaptureStores[AllocaNo];
+
+      // If this is for a capture, then we don't want to remove it.
+      if (PCS && SI == PCS)
         continue;
 
       // what value were we writing?
-      unsigned AllocaNo = ai->second;
       IncomingVals[AllocaNo] = SI->getOperand(0);
 
       // Record debuginfo for the store before removing it.
       IncomingLocs[AllocaNo] = SI->getDebugLoc();
-      for (DbgVariableIntrinsic *DII : AllocaDbgUsers[ai->second])
+      for (DbgVariableIntrinsic *DII : AllocaDbgUsers[AllocaNo])
         if (DII->isAddressOfVariable())
           ConvertDebugDeclareToDebugValue(DII, SI, DIB);
       BB->getInstList().erase(SI);
@@ -1019,3 +1379,9 @@
 
   PromoteMem2Reg(Allocas, DT, AC).run();
 }
+
+bool llvm::partialPromoteMemToReg(ArrayRef<AllocaInst *> Allocas, LoopInfo &LI,
+                                  DominatorTree &DT, PostDominatorTree &PDT,
+                                  AssumptionCache *AC) {
+  return PromoteMem2Reg(Allocas, DT, AC).runPartial(PDT, LI);
+}
diff --git a/llvm/test/Transforms/Mem2Reg/partial-mem2reg.ll b/llvm/test/Transforms/Mem2Reg/partial-mem2reg.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/Transforms/Mem2Reg/partial-mem2reg.ll
@@ -0,0 +1,348 @@
+; RUN: opt -partial-mem2reg -debug-only=partial-mem2reg,mem2reg -S < %s 2>&1 | FileCheck %s --check-prefix=DEBUG
+; RUN: opt -partial-mem2reg -S < %s 2>&1 | FileCheck %s --check-prefix=XFORM
+; RUN: opt -partial-mem2reg -gvn -loop-vectorize -debug-only=loop-vectorize -S < %s 2>&1 | FileCheck %s --check-prefix=VDEBUG
+; REQUIRES: asserts
+
+target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
+target triple = "aarch64-unknown-linux-gnu"
+
+;; Tests based on the following C code, simplified. The OpenMP codegen from
+;; clang created an outlined function where the alloca for the reduction
+;; was captured by a call to the runtime and couldn't be promoted to register
+;; SSA form. This prevented vectorization.
+;;
+;; We can now vectorize this by partially promoting the alloca, just converting
+;; definitions and uses that are postdominated by the capture. This is a
+;; conservative first attempt at this optimization.
+;;
+;; int loop(int data[restrict 128U])
+;; {
+;;   int retval = 0;
+;;
+;;   #pragma omp parallel for simd schedule(simd:static) default(none) shared(data) reduction(+:retval)
+;;   for (int i = 0; i < 128; i++) {
+;;     int n = 0;
+;;
+;;     if (data[i]) {
+;;       n = 1;
+;;       retval += n;
+;;     }
+;;   }
+;;   return retval;
+;; }
+
+; DEBUG-LABEL: PartialMem2Reg on: captured_reduction
+; DEBUG-NEXT: PM2R: Analyzing alloca: %retval = alloca i32, align 4
+; DEBUG-NEXT: PM2R: Partially promoting alloca: %retval = alloca i32, align 4
+; DEBUG-NEXT: PM2R: Analyzing alloca: %red_list = alloca i32*, align 8
+; DEBUG-NEXT: PM2R: Unhandled uses.
+
+; VDEBUG-LABEL: LV: Checking a loop in "captured_reduction" from
+; VDEBUG: LV: We can vectorize this loop!
+ +; XFORM-LABEL: @captured_reduction +define i32 @captured_reduction(i32* nocapture nonnull readonly %data, i32 %n) { +entry: + %retval = alloca i32, align 4 + %red_list = alloca i32*, align 8 + store i32 0, i32* %retval, align 4 + %limit = zext i32 %n to i64 + br label %loop.ph + +loop.ph: + %iter.check = icmp ugt i64 %limit, 0 + br i1 %iter.check, label %loop.body, label %loop.exit + +; XFORM: loop.body: +; XFORM-NEXT: %retval.0 = phi i32 [ 0, %loop.ph ], [ %retval.1, %if.end ] +loop.body: + %indvars.iv = phi i64 [ 0, %loop.ph ], [ %indvars.iv.next, %if.end ] + %arrayidx = getelementptr inbounds i32, i32* %data, i64 %indvars.iv + %pred = load i32, i32* %arrayidx, align 4 + %tobool.not = icmp eq i32 %pred, 0 + br i1 %tobool.not, label %if.end, label %if.then + +if.then: + %rdx = load i32, i32* %retval, align 4 + %rdx.inc = add nsw i32 %rdx, 1 + store i32 %rdx.inc, i32* %retval, align 4 + br label %if.end + +; XFORM: if.end: +; XFORM-NEXT: %retval.1 = phi i32 [ %retval.0, %loop.body ], [ %rdx.inc, %if.then ] +if.end: + %indvars.iv.next = add nsw i64 %indvars.iv, 1 + %exitcond = icmp ne i64 %indvars.iv.next, %limit + br i1 %exitcond, label %loop.body, label %loop.exit + +; XFORM: loop.exit: +; XFORM-NEXT: %retval.2 = phi i32 [ %retval.1, %if.end ], [ 0, %loop.ph ] +; XFORM-NEXT: store i32 %retval.2, i32* %retval, align 4 +loop.exit: + br label %capture + +capture: + store i32* %retval, i32** %red_list, align 8 + %0 = call i32 @capturin_ur_allocas(i32** nonnull %red_list) + %1 = load i32, i32* %retval, align 4 + ret i32 %1 +} + +; DEBUG-LABEL: PartialMem2Reg on: too_many_captures +; DEBUG-NEXT: PM2R: Analyzing alloca: %retval = alloca i32, align 4 +; DEBUG-NEXT: PM2R: No capture or multiple captures. +; DEBUG-NEXT: PM2R: Analyzing alloca: %red_list_2 = alloca i32*, align 8 +; DEBUG-NEXT: PM2R: Unhandled uses. +; DEBUG-NEXT: PM2R: Analyzing alloca: %red_list = alloca i32*, align 8 +; DEBUG-NEXT: PM2R: Unhandled uses. + +; VDEBUG-LABEL: LV: Checking a loop in "too_many_captures" from +; VDEBUG: LV: Can't vectorize the instructions or CFG +; VDEBUG: LV: Not vectorizing: Cannot prove legality. 
+ +define i32 @too_many_captures(i32* nocapture nonnull readonly %data, i32 %n) { +entry: + %retval = alloca i32, align 4 + %red_list = alloca i32*, align 8 + %red_list_2 = alloca i32*, align 8 + store i32 0, i32* %retval, align 4 + %limit = zext i32 %n to i64 + br label %loop.ph + +loop.ph: + %iter.check = icmp ugt i64 %limit, 0 + br i1 %iter.check, label %loop.body, label %loop.exit + +loop.body: + %indvars.iv = phi i64 [ 0, %loop.ph ], [ %indvars.iv.next, %if.end ] + %arrayidx = getelementptr inbounds i32, i32* %data, i64 %indvars.iv + %pred = load i32, i32* %arrayidx, align 4 + %tobool.not = icmp eq i32 %pred, 0 + br i1 %tobool.not, label %if.end, label %if.then + +if.then: + %rdx = load i32, i32* %retval, align 4 + %rdx.inc = add nsw i32 %rdx, 1 + store i32 %rdx.inc, i32* %retval, align 4 + br label %if.end + +if.end: + %indvars.iv.next = add nsw i64 %indvars.iv, 1 + %exitcond = icmp ne i64 %indvars.iv.next, %limit + br i1 %exitcond, label %loop.body, label %loop.exit + +loop.exit: + br label %capture + +capture: + store i32* %retval, i32** %red_list, align 8 + %0 = call i32 @capturin_ur_allocas(i32** nonnull %red_list) + store i32* %retval, i32** %red_list_2, align 8 + %1 = call i32 @capturin_ur_allocas(i32** nonnull %red_list_2) + %2 = load i32, i32* %retval, align 4 + ret i32 %2 +} + +; DEBUG-LABEL: PartialMem2Reg on: no_captures +; DEBUG-NEXT: PM2R: Analyzing alloca: %retval = alloca i32, align 4 +; DEBUG-NEXT: PM2R: No capture or multiple captures. + +; VDEBUG-LABEL: LV: Checking a loop in "no_captures" from +; VDEBUG: LV: Can't vectorize the instructions or CFG +; VDEBUG: LV: Not vectorizing: Cannot prove legality. + +define i32 @no_captures(i32* nocapture nonnull readonly %data, i32 %n) { +entry: + %retval = alloca i32, align 4 + store i32 0, i32* %retval, align 4 + %limit = zext i32 %n to i64 + br label %loop.ph + +loop.ph: + %iter.check = icmp ugt i64 %limit, 0 + br i1 %iter.check, label %loop.body, label %loop.exit + +loop.body: + %indvars.iv = phi i64 [ 0, %loop.ph ], [ %indvars.iv.next, %if.end ] + %arrayidx = getelementptr inbounds i32, i32* %data, i64 %indvars.iv + %pred = load i32, i32* %arrayidx, align 4 + %tobool.not = icmp eq i32 %pred, 0 + br i1 %tobool.not, label %if.end, label %if.then + +if.then: + %rdx = load i32, i32* %retval, align 4 + %rdx.inc = add nsw i32 %rdx, 1 + store i32 %rdx.inc, i32* %retval, align 4 + br label %if.end + +if.end: + %indvars.iv.next = add nsw i64 %indvars.iv, 1 + %exitcond = icmp ne i64 %indvars.iv.next, %limit + br i1 %exitcond, label %loop.body, label %loop.exit + +loop.exit: + br label %nocapture + +nocapture: + %0 = load i32, i32* %retval, align 4 + ret i32 %0 +} + +; DEBUG-LABEL: PartialMem2Reg on: no_postdominated_users +; DEBUG-NEXT: PM2R: Analyzing alloca: %retval = alloca i32, align 4 +; DEBUG-NEXT: PM2R: No users postdominated by capture. +; DEBUG-NEXT: PM2R: Analyzing alloca: %red_list = alloca i32*, align 8 +; DEBUG-NEXT: PM2R: Unhandled uses. + +; VDEBUG-LABEL: LV: Checking a loop in "no_postdominated_users" from +; VDEBUG: LV: We can vectorize this loop! 
+ +define i32 @no_postdominated_users(i32* nocapture nonnull readonly %data, i32 %n) { +entry: + %retval = alloca i32, align 4 + %red_list = alloca i32*, align 8 + %limit = zext i32 %n to i64 + br label %loop.ph + +loop.ph: + %iter.check = icmp ugt i64 %limit, 0 + br i1 %iter.check, label %loop.body, label %loop.exit + +loop.body: + %indvars.iv = phi i64 [ 0, %loop.ph ], [ %indvars.iv.next, %if.end ] + %arrayidx = getelementptr inbounds i32, i32* %data, i64 %indvars.iv + %pred = load i32, i32* %arrayidx, align 4 + %tobool.not = icmp eq i32 %pred, 0 + br i1 %tobool.not, label %if.end, label %if.then + +if.then: + br label %if.end + +if.end: + %indvars.iv.next = add nsw i64 %indvars.iv, 1 + %exitcond = icmp ne i64 %indvars.iv.next, %limit + br i1 %exitcond, label %loop.body, label %loop.exit + +loop.exit: + br label %capture + +capture: + store i32* %retval, i32** %red_list, align 8 + %0 = call i32 @capturin_ur_allocas(i32** nonnull %red_list) + store i32 0, i32* %retval, align 4 + %1 = load i32, i32* %retval, align 4 + ret i32 %1 +} + +; DEBUG-LABEL: PartialMem2Reg on: no_loops +; DEBUG-NEXT: PM2R: Analyzing alloca: %retval = alloca i32, align 4 +; DEBUG-NEXT: PM2R: No users in loops. +; DEBUG-NEXT: PM2R: Analyzing alloca: %red_list = alloca i32*, align 8 +; DEBUG-NEXT: PM2R: Unhandled uses. + +define i32 @no_loops(i32* nocapture nonnull readonly %data, i32 %n) { +entry: + %retval = alloca i32, align 4 + %red_list = alloca i32*, align 8 + store i32 0, i32* %retval, align 4 + %limit = zext i32 %n to i64 + br label %capture + +capture: + store i32* %retval, i32** %red_list, align 8 + %0 = call i32 @capturin_ur_allocas(i32** nonnull %red_list) + store i32 0, i32* %retval, align 4 + %1 = load i32, i32* %retval, align 4 + ret i32 %1 +} + +; DEBUG-LABEL: PartialMem2Reg on: live_block_not_postdominated +; DEBUG-NEXT: PM2R: Analyzing alloca: %retval = alloca i32, align 4 +; DEBUG-NEXT: PM2R: not all live blocks are postdominated by capture. +; DEBUG-NEXT: PM2R: Analyzing alloca: %red_list = alloca i32*, align 8 +; DEBUG-NEXT: PM2R: Unhandled uses. + +; VDEBUG-LABEL: LV: Checking a loop in "live_block_not_postdominated" from +; VDEBUG: LV: Can't vectorize the instructions or CFG +; VDEBUG: LV: Not vectorizing: Cannot prove legality. +; VDEBUG: LV: Checking a loop in "live_block_not_postdominated" from +; VDEBUG: LV: Can't vectorize the instructions or CFG +; VDEBUG: LV: Not vectorizing: Cannot prove legality. 
+ +define i32 @live_block_not_postdominated(i32* nocapture nonnull readonly %data, i32 %n, i32 %cond1, i32 %cond2) { +entry: + %retval = alloca i32, align 4 + %red_list = alloca i32*, align 8 + store i32 0, i32* %retval, align 4 + %limit = zext i32 %n to i64 + br label %cond1.check + +cond1.check: + %cmp1 = icmp ugt i32 %cond1, 77 + br i1 %cmp1, label %loop.ph, label %loop2.ph + +loop.ph: + %iter.check = icmp ugt i64 %limit, 0 + br i1 %iter.check, label %loop.body, label %loop2.exit + +loop.body: + %indvars.iv = phi i64 [ 0, %loop.ph ], [ %indvars.iv.next, %if.end ] + %arrayidx = getelementptr inbounds i32, i32* %data, i64 %indvars.iv + %pred = load i32, i32* %arrayidx, align 4 + %tobool.not = icmp eq i32 %pred, 0 + br i1 %tobool.not, label %if.end, label %if.then + +if.then: + %rdx = load i32, i32* %retval, align 4 + %rdx.inc = add nsw i32 %rdx, 1 + store i32 %rdx.inc, i32* %retval, align 4 + br label %if.end + +if.end: + %indvars.iv.next = add nsw i64 %indvars.iv, 1 + %exitcond = icmp ne i64 %indvars.iv.next, %limit + br i1 %exitcond, label %loop.body, label %loop.exit + +loop.exit: + br label %capture + +loop2.ph: + %iter.check2 = icmp ugt i64 %limit, 0 + br i1 %iter.check2, label %loop2.body, label %loop2.exit + +loop2.body: + %indvars.iv2 = phi i64 [ 0, %loop2.ph ], [ %indvars.iv.next2, %if.end2 ] + %arrayidx2 = getelementptr inbounds i32, i32* %data, i64 %indvars.iv2 + %pred2 = load i32, i32* %arrayidx2, align 4 + %tobool.not2 = icmp eq i32 %pred2, 0 + br i1 %tobool.not2, label %if.end2, label %if.then2 + +if.then2: + %rdx2 = load i32, i32* %retval, align 4 + %rdx.inc2 = add nsw i32 %rdx2, 1 + store i32 %rdx.inc2, i32* %retval, align 4 + br label %if.end2 + +if.end2: + %indvars.iv.next2 = add nsw i64 %indvars.iv2, 1 + %exitcond2 = icmp ne i64 %indvars.iv.next2, %limit + br i1 %exitcond2, label %loop2.body, label %loop2.exit + +loop2.exit: + br label %cond2.check + +cond2.check: + %cmp2 = icmp eq i32 %cond2, 403 + br i1 %cmp2, label %post.capture, label %capture + +capture: + store i32* %retval, i32** %red_list, align 8 + %0 = call i32 @capturin_ur_allocas(i32** nonnull %red_list) + br label %post.capture + +post.capture: + %1 = load i32, i32* %retval, align 4 + ret i32 %1 +} + +declare dso_local i32 @capturin_ur_allocas(i32** nonnull)
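Usage note (not part of the patch): the pass is exposed under three names, all of which appear in the diff above -- the legacy pass manager registration used by the test's RUN lines, the new pass manager registration in PassRegistry.def, and the hidden option that gates it in the default optimization pipeline built by PassBuilder. A minimal way to exercise it with opt (the input filename below is a placeholder):

  opt -partial-mem2reg -S input.ll                               ; legacy PM, as in the RUN lines
  opt -passes=partial-mem2reg -S input.ll                        ; new PM registration
  opt -passes='default<O3>' -enable-partial-mem2reg -S input.ll  ; gate the pass into the default pipeline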