Index: include/llvm/Analysis/ParallelRegionInfo.h =================================================================== --- include/llvm/Analysis/ParallelRegionInfo.h +++ include/llvm/Analysis/ParallelRegionInfo.h @@ -45,9 +45,20 @@ /// @TODO The update interface to add/remove parallel regions is missing. /// class ParallelRegionInfo { +public: + /// The container type used to store top level parallel regions. + using ParallelRegionVectorTy = SmallVector<ParallelRegion *, 4>; + + /// Iterator types for top level parallel regions. + /// + ///{ + using iterator = ParallelRegionVectorTy::iterator; + using const_iterator = ParallelRegionVectorTy::const_iterator; + ///} +private: /// The parallel regions that are not nested in other parallel regions. - SmallVector<ParallelRegion *, 4> TopLevelParallelRegions; + ParallelRegionVectorTy TopLevelParallelRegions; public: ParallelRegionInfo() {} @@ -71,14 +82,60 @@ /// Return the top-level parallel regions in this function. /// ///{ - SmallVectorImpl<ParallelRegion *> &getTopLevelParallelRegions() { + ParallelRegionVectorTy &getTopLevelParallelRegions() { return TopLevelParallelRegions; } - const SmallVectorImpl<ParallelRegion *> &getTopLevelParallelRegions() const { + const ParallelRegionVectorTy &getTopLevelParallelRegions() const { return TopLevelParallelRegions; } ///} + /// Return true if there is no parallel region in this function. + bool empty() const { return TopLevelParallelRegions.empty(); } + + /// Remove all cached parallel regions of this function. + void clear() { TopLevelParallelRegions.clear(); } + + /// Return the number of top level parallel regions in this function. + ParallelRegionVectorTy::size_type size() const { + return TopLevelParallelRegions.size(); + } + + /// Iterators for parallel regions. 
+ /// + ///{ + iterator begin() { return TopLevelParallelRegions.begin(); } + iterator end() { return TopLevelParallelRegions.end(); } + const_iterator begin() const { return TopLevelParallelRegions.begin(); } + const_iterator end() const { return TopLevelParallelRegions.end(); } + ///} + + /// Fill @p Container with all parallel regions (not only top level). + void getAllParallelRegions(ParallelRegionVectorTy &Container) const; + + /// Return the parallel region that makes @p L a parallel loop, if any. + /// + /// A parallel loop is a loop that does fork (parts of) its body to a new + /// task which are joined only after the loop. It is also ensured that + /// everything that is not forked but part of the loop does not cause + /// side-effects. These instructions usually compute the exit condition + /// and the new values for the induction variable(s). + /// + /// Note: We allow multiple induction variables and complex exit conditions + /// here. However, for some parallel runtimes these have to be simplified, + /// e.g. expressed with regards to one canonical induction variable. + /// TODO: Provide an interface to perform the simplification. + /// + /// Note: We allow arbitrary code after the loop but before a join + /// instruction. + ParallelRegion *getParallelLoopRegion(const Loop &L, + const DominatorTree &DT) const; + + /// Return true if @p L is a parallel loop. + /// + /// See ParallelRegionInfo::getParallelLoopRegion for more information. + bool isParallelLoop(const Loop &L, const DominatorTree &DT) const; + /// Check for containment in any parallel region. ///{ bool containedInAny(const BasicBlock *BB, const DominatorTree &DT) const; @@ -308,6 +365,11 @@ } ///} + /// Return true if this is a parallel loop region for @p L. + /// + /// See ParallelRegionInfo::getParallelLoopRegion for more information. 
+ bool isParallelLoopRegion(const Loop &L, const DominatorTree &DT) const; + /// @see ParallelTask::contains(const BasicBlock *BB, DominatorTree &DT) bool contains(const BasicBlock *BB, const DominatorTree &DT) const; Index: include/llvm/InitializePasses.h =================================================================== --- include/llvm/InitializePasses.h +++ include/llvm/InitializePasses.h @@ -58,6 +58,9 @@ /// Initialize all passes linked into the CodeGen library. void initializeTarget(PassRegistry&); +/// Initialize all passes linked into the PIR library. +void initializePIROpts(PassRegistry&); + void initializeAAEvalLegacyPassPass(PassRegistry&); void initializeAAResultsWrapperPassPass(PassRegistry &); void initializeADCELegacyPassPass(PassRegistry&); @@ -319,6 +322,7 @@ void initializeScalarizerPass(PassRegistry&); void initializeScopedNoAliasAAWrapperPassPass(PassRegistry&); void initializeSeparateConstOffsetFromGEPPass(PassRegistry &); +void initializeSequentializePIRPass(PassRegistry&); void initializeShadowStackGCLoweringPass(PassRegistry&); void initializeShrinkWrapPass(PassRegistry &); void initializeSimpleInlinerPass(PassRegistry&); Index: include/llvm/Transforms/Scalar/SROA.h =================================================================== --- include/llvm/Transforms/Scalar/SROA.h +++ include/llvm/Transforms/Scalar/SROA.h @@ -26,6 +26,8 @@ namespace llvm { +class ParallelRegionInfo; + /// A private "module" namespace for types and utilities used by SROA. These /// are implementation details and should not be used by clients. namespace sroa LLVM_LIBRARY_VISIBILITY { @@ -59,6 +61,7 @@ LLVMContext *C = nullptr; DominatorTree *DT = nullptr; AssumptionCache *AC = nullptr; + ParallelRegionInfo *PRI = nullptr; /// \brief Worklist of alloca instructions to simplify. /// @@ -114,7 +117,7 @@ /// Helper used by both the public run method and by the legacy pass. 
PreservedAnalyses runImpl(Function &F, DominatorTree &RunDT, - AssumptionCache &RunAC); + AssumptionCache &RunAC, ParallelRegionInfo &RunPRI); bool presplitLoadsAndStores(AllocaInst &AI, sroa::AllocaSlices &AS); AllocaInst *rewritePartition(AllocaInst &AI, sroa::AllocaSlices &AS, Index: include/llvm/Transforms/Utils/PromoteMemToReg.h =================================================================== --- include/llvm/Transforms/Utils/PromoteMemToReg.h +++ include/llvm/Transforms/Utils/PromoteMemToReg.h @@ -22,6 +22,7 @@ class DominatorTree; class AliasSetTracker; class AssumptionCache; +class ParallelRegionInfo; /// \brief Return true if this alloca is legal for promotion. /// @@ -29,7 +30,12 @@ /// (transitively) using this alloca. This also enforces that there is only /// ever one layer of bitcasts or GEPs between the alloca and the lifetime /// markers. -bool isAllocaPromotable(const AllocaInst *AI); +/// +/// In case the dominance tree and the parallel region info are _both_ given +/// we also verify that promoting the alloca does not break invariants of the +/// parallel regions. See ParallelRegionInfo::isSafeToPromote(*) for details. +bool isAllocaPromotable(const AllocaInst *AI, const DominatorTree *DT = nullptr, + const ParallelRegionInfo *PRI = nullptr); /// \brief Promote the specified list of alloca instructions into scalar /// registers, inserting PHI nodes as appropriate. Index: lib/Analysis/ParallelRegionInfo.cpp =================================================================== --- lib/Analysis/ParallelRegionInfo.cpp +++ lib/Analysis/ParallelRegionInfo.cpp @@ -182,6 +182,72 @@ ParallelSubRegions[&ParallelSubRegion.getFork()] = &ParallelSubRegion; } +bool ParallelRegion::isParallelLoopRegion(const Loop &L, + const DominatorTree &DT) const { + // Bail if the fork is not part of the loop. + if (!L.contains(getFork().getParent())) + return false; + + // Bail if a join is part of the loop. 
+ const auto &JoinsVector = ContinuationTask.getHaltsOrJoints(); + if (std::any_of(JoinsVector.begin(), JoinsVector.end(), + [&L](TerminatorInst *TI) { return L.contains(TI); })) + return false; + + // Now check for possible side effects outside the forked task. First the + // header to the fork and then the part of the continuation which is inside + // the loop. + SmallVector<BasicBlock *, 8> Worklist; + BasicBlock *HeaderBB = L.getHeader(); + Worklist.push_back(HeaderBB); + + while (!Worklist.empty()) { + BasicBlock *BB = Worklist.pop_back_val(); + assert(L.contains(BB)); + + for (Instruction &I : *BB) + if (I.mayHaveSideEffects()) + return false; + + if (&getFork() == BB->getTerminator()) + continue; + + for (BasicBlock *SuccessorBB : successors(BB)) + if (L.contains(SuccessorBB)) + Worklist.push_back(SuccessorBB); + } + + BasicBlock &ContinuationEntryBB = ContinuationTask.getEntry(); + assert(L.contains(&ContinuationEntryBB) && + "Fork instructions should not be used to exit a loop!"); + + // We do not support nested loops in the continuation part. + if (std::any_of(L.begin(), L.end(), [&](Loop *SubLoop) { + return ContinuationTask.contains(SubLoop, DT); + })) + return false; + + // Traverse all blocks that are in the continuation and in the loop and check + // for side-effects. + assert(Worklist.empty()); + Worklist.push_back(&ContinuationEntryBB); + + while (!Worklist.empty()) { + BasicBlock *BB = Worklist.pop_back_val(); + assert(L.contains(BB)); + + for (Instruction &I : *BB) + if (I.mayHaveSideEffects()) + return false; + + for (BasicBlock *SuccessorBB : successors(BB)) + if (L.contains(SuccessorBB) && SuccessorBB != HeaderBB) + Worklist.push_back(SuccessorBB); + } + + return true; +} + bool ParallelRegion::contains(const BasicBlock *BB, const DominatorTree &DT) const { // All contained blocks are dominated by the fork block. 
@@ -430,6 +496,59 @@ return BB2PTMap; } +void ParallelRegionInfo::getAllParallelRegions( + ParallelRegionVectorTy &Container) const { + + Container.append(begin(), end()); + for (unsigned i = 0; i < Container.size(); i++) { + ParallelRegion *PR = Container[i]; + + for (const auto &It : PR->getSubRegions()) + Container.push_back(It.getSecond()); + } +} + +ParallelRegion *ParallelRegionInfo::getParallelLoopRegion(const Loop &L, + const DominatorTree &DT) const { + if (empty()) + return nullptr; + + SmallVector<ParallelRegion *, 8> ParallelRegions; + ParallelRegions.append(begin(), end()); + + // The parallel region we are looking for. + ParallelRegion *ParallelLoopRegion = nullptr; + + while (!ParallelRegions.empty()) { + ParallelRegion *PR = ParallelRegions.pop_back_val(); + + for (const auto &It : PR->getSubRegions()) + ParallelRegions.push_back(It.getSecond()); + + if (!PR->isParallelLoopRegion(L, DT)) + continue; + + assert(!ParallelLoopRegion && + "Broken nesting resulted in multiple parallel loop regions"); + + ParallelLoopRegion = PR; + +#ifdef NDEBUG + // In release builds we assume the nesting was intact and stop at the first + // parallel loop region. Debug builds keep scanning so the assertion above + // can verify that at most one parallel loop region exists. + break; +#endif + } + + return ParallelLoopRegion; +} + +bool ParallelRegionInfo::isParallelLoop(const Loop &L, + const DominatorTree &DT) const { + // See ParallelRegionInfo::getParallelLoopRegion(...) for more information. + return getParallelLoopRegion(L, DT) != nullptr; +} + bool ParallelRegionInfo::containedInAny(const BasicBlock *BB, const DominatorTree &DT) const { return std::any_of( @@ -468,6 +587,8 @@ bool ParallelRegionInfo::isSafeToPromote(const AllocaInst &AI, const DominatorTree &DT) const { + if (empty()) + return true; // First check if we know that AI is contained in a parallel region. 
ParallelRegion *AIPR = nullptr; Index: lib/Passes/LLVMBuild.txt =================================================================== --- lib/Passes/LLVMBuild.txt +++ lib/Passes/LLVMBuild.txt @@ -19,4 +19,4 @@ type = Library name = Passes parent = Libraries -required_libraries = Analysis CodeGen Core IPO InstCombine Scalar Support TransformUtils Vectorize Instrumentation +required_libraries = Analysis CodeGen Core IPO InstCombine Scalar Support TransformUtils Vectorize Instrumentation PIR Index: lib/Transforms/CMakeLists.txt =================================================================== --- lib/Transforms/CMakeLists.txt +++ lib/Transforms/CMakeLists.txt @@ -5,5 +5,6 @@ add_subdirectory(IPO) add_subdirectory(Vectorize) add_subdirectory(Hello) +add_subdirectory(PIR) add_subdirectory(ObjCARC) add_subdirectory(Coroutines) Index: lib/Transforms/LLVMBuild.txt =================================================================== --- lib/Transforms/LLVMBuild.txt +++ lib/Transforms/LLVMBuild.txt @@ -16,7 +16,7 @@ ;===------------------------------------------------------------------------===; [common] -subdirectories = Coroutines IPO InstCombine Instrumentation Scalar Utils Vectorize ObjCARC +subdirectories = Coroutines IPO InstCombine Instrumentation Scalar Utils Vectorize ObjCARC PIR [component_0] type = Group Index: lib/Transforms/PIR/CMakeLists.txt =================================================================== --- /dev/null +++ lib/Transforms/PIR/CMakeLists.txt @@ -0,0 +1,10 @@ +add_llvm_library(LLVMPIR + SequentializePIR.cpp + PIR.cpp + + ADDITIONAL_HEADER_DIRS + ${LLVM_MAIN_INCLUDE_DIR}/llvm/Transforms + + DEPENDS + intrinsics_gen + ) Index: lib/Transforms/PIR/LLVMBuild.txt =================================================================== --- lib/Transforms/PIR/LLVMBuild.txt +++ lib/Transforms/PIR/LLVMBuild.txt @@ -1,4 +1,4 @@ -;===- ./lib/Passes/LLVMBuild.txt -------------------------------*- Conf -*--===; +;===- ./lib/Transforms/PIR/LLVMBuild.txt 
-----------------------*- Conf -*--===; ; ; The LLVM Compiler Infrastructure ; @@ -17,6 +17,6 @@ [component_0] type = Library -name = Passes -parent = Libraries -required_libraries = Analysis CodeGen Core IPO InstCombine Scalar Support TransformUtils Vectorize Instrumentation +name = PIR +parent = Transforms +required_libraries = Analysis BitWriter Core InstCombine IRReader Linker Object ProfileData Scalar Support TransformUtils Vectorize Instrumentation Index: lib/Transforms/PIR/PIR.cpp =================================================================== --- /dev/null +++ lib/Transforms/PIR/PIR.cpp @@ -0,0 +1,16 @@ +//===-- PIR.cpp -----------------------------------------------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// + +#include "llvm/InitializePasses.h" + +using namespace llvm; + +void llvm::initializePIROpts(PassRegistry &Registry) { + initializeSequentializePIRPass(Registry); +} Index: lib/Transforms/PIR/SequentializePIR.cpp =================================================================== --- /dev/null +++ lib/Transforms/PIR/SequentializePIR.cpp @@ -0,0 +1,156 @@ +//===-- PIR/SequentializePIR.cpp - Sequentialize parallel regions ---------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This pass will lower all parallel regions to sequential IR. 
+// +//===----------------------------------------------------------------------===// + +#include "llvm/Analysis/ParallelRegionInfo.h" +#include "llvm/Analysis/LoopInfo.h" +#include "llvm/IR/BasicBlock.h" +#include "llvm/IR/Function.h" +#include "llvm/IR/Instructions.h" +#include "llvm/Pass.h" +#include "llvm/Support/raw_ostream.h" + +using namespace llvm; + +namespace { + +/// This pass will lower all parallel regions to sequential IR. It replaces +/// PIR instructions by unconditional branch instructions as follows: +/// - fork is replaced with a branch to the forked block. +/// - halt is replaced with a branch to the sibling continuation block. +/// - join is replaced with a branch to its destination block. +/// +/// If a parallel loop is encountered we annotate it as parallel using metadata. +struct SequentializePIR : public FunctionPass { + static char ID; + + SequentializePIR() : FunctionPass(ID) { + initializeSequentializePIRPass(*PassRegistry::getPassRegistry()); + } + + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.addRequired<LoopInfoWrapperPass>(); + AU.addRequired<ParallelRegionInfoPass>(); + AU.addRequired<DominatorTreeWrapperPass>(); + + AU.addPreserved<LoopInfoWrapperPass>(); + AU.addPreserved<ParallelRegionInfoPass>(); + AU.addPreserved<DominatorTreeWrapperPass>(); + AU.setPreservesCFG(); + } + + ParallelTask::VisitorTy createParallelTaskAnnotator(MDNode *LoopID) { + assert(LoopID); + + // Helper function that annotates all read/write instructions in a basic + // block with parallel memory metadata referring to LoopID. 
+ ParallelTask::VisitorTy Annotator = [LoopID](BasicBlock &BB, + const ParallelTask &) { + for (Instruction &I : BB) { + if (!I.mayReadOrWriteMemory()) + continue; + + MDNode *LoopIdMD = + I.getMetadata(LLVMContext::MD_mem_parallel_loop_access); + if (!LoopIdMD) + LoopIdMD = MDNode::get(I.getContext(), {LoopID}); + else + LoopIdMD = MDNode::concatenate(LoopIdMD, LoopID); + I.setMetadata(LLVMContext::MD_mem_parallel_loop_access, LoopIdMD); + } + + return true; + }; + + return Annotator; + } + + bool runOnFunction(Function &F) override { + ParallelRegionInfo &PRI = + getAnalysis<ParallelRegionInfoPass>().getParallelRegionInfo(); + + // Exit early if there is nothing to do. + if (PRI.empty()) + return false; + + LoopInfo &LI = getAnalysis<LoopInfoWrapperPass>().getLoopInfo(); + const DominatorTree &DT = + getAnalysis<DominatorTreeWrapperPass>().getDomTree(); + + ParallelRegionInfo::ParallelRegionVectorTy ParallelRegions; + PRI.getAllParallelRegions(ParallelRegions); + + for (ParallelRegion *PR : ParallelRegions) { + Loop *L = LI.getLoopFor(PR->getFork().getParent()); + bool AnnotateAsParallel = + (L && !L->isAnnotatedParallel() && PR->isParallelLoopRegion(*L, DT)); + + // Replace the fork, join and halt instructions by branches. Note that + // this does not change the dominator tree or loop info. + ForkInst &Fork = PR->getFork(); + BranchInst::Create(Fork.getForkedBB(), &Fork); + Fork.eraseFromParent(); + + ParallelTask &ForkedTask = PR->getForkedTask(); + for (TerminatorInst *TI : ForkedTask.getHaltsOrJoints()) { + assert(isa<HaltInst>(TI) && "Forked task was not terminated by a halt!"); + BranchInst::Create(TI->getSuccessor(0), TI); + TI->eraseFromParent(); + } + + ParallelTask &ContinuationTask = PR->getContinuationTask(); + for (TerminatorInst *TI : ContinuationTask.getHaltsOrJoints()) { + assert(isa<JoinInst>(TI) && + "Continuation task was not terminated by a join!"); + BranchInst::Create(TI->getSuccessor(0), TI); + TI->eraseFromParent(); + } + + if (!AnnotateAsParallel) + continue; + + // Get or create a loop id for the current loop. 
+ MDNode *LoopID = L->getLoopID(); + if (!LoopID) { + LoopID = MDNode::get(F.getContext(), {nullptr}); + LoopID->replaceOperandWith(0, LoopID); + L->setLoopID(LoopID); + } + + // Annotate all blocks in the forked task with + // llvm.mem.parallel_loop_access metadata. Note that no other side-effects + // are allowed in the loop outside the forked task. + ParallelTask::VisitorTy Annotator = createParallelTaskAnnotator(LoopID); + ForkedTask.visit(Annotator); + + // Verify that (for simplified loops) the annotation is recognized. + assert(!L->isLoopSimplifyForm() || L->isAnnotatedParallel()); + } + + // Update the parallel region info. + PRI.clear(); + assert(PRI.empty()); + + return true; + } +}; +} + +char SequentializePIR::ID = 0; + +INITIALIZE_PASS_BEGIN(SequentializePIR, "sequentialize-pir", + "Lower parallel regions to sequential IR", false, false) +INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass) +INITIALIZE_PASS_DEPENDENCY(ParallelRegionInfoPass) +INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) +INITIALIZE_PASS_END(SequentializePIR, "sequentialize-pir", + "Lower parallel regions to sequential IR", false, false) Index: lib/Transforms/Scalar/SROA.cpp =================================================================== --- lib/Transforms/Scalar/SROA.cpp +++ lib/Transforms/Scalar/SROA.cpp @@ -30,6 +30,7 @@ #include "llvm/Analysis/AssumptionCache.h" #include "llvm/Analysis/GlobalsModRef.h" #include "llvm/Analysis/Loads.h" +#include "llvm/Analysis/ParallelRegionInfo.h" #include "llvm/Analysis/PtrUseVisitor.h" #include "llvm/Analysis/ValueTracking.h" #include "llvm/IR/Constants.h" @@ -4082,6 +4083,13 @@ AI.eraseFromParent(); return true; } + + // For now we do not perform SROA if it might break a parallel region. + // TODO Perform this check on the slices and promote the ones that are safe to + // promote. 
+ if (!PRI->isSafeToPromote(AI, *DT)) + return false; + const DataLayout &DL = AI.getModule()->getDataLayout(); // Skip alloca forms that this analysis can't handle. @@ -4191,11 +4199,13 @@ } PreservedAnalyses SROA::runImpl(Function &F, DominatorTree &RunDT, - AssumptionCache &RunAC) { + AssumptionCache &RunAC, + ParallelRegionInfo &RunPRI) { DEBUG(dbgs() << "SROA function: " << F.getName() << "\n"); C = &F.getContext(); DT = &RunDT; AC = &RunAC; + PRI = &RunPRI; BasicBlock &EntryBB = F.getEntryBlock(); for (BasicBlock::iterator I = EntryBB.begin(), E = std::prev(EntryBB.end()); @@ -4243,7 +4253,8 @@ PreservedAnalyses SROA::run(Function &F, FunctionAnalysisManager &AM) { return runImpl(F, AM.getResult<DominatorTreeAnalysis>(F), - AM.getResult<AssumptionAnalysis>(F)); + AM.getResult<AssumptionAnalysis>(F), + AM.getResult<ParallelRegionAnalysis>(F)); } /// A legacy pass for the legacy pass manager that wraps the \c SROA pass. @@ -4264,12 +4275,14 @@ auto PA = Impl.runImpl( F, getAnalysis<DominatorTreeWrapperPass>().getDomTree(), - getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F)); + getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F), + getAnalysis<ParallelRegionInfoPass>().getParallelRegionInfo()); return !PA.areAllPreserved(); } void getAnalysisUsage(AnalysisUsage &AU) const override { AU.addRequired<AssumptionCacheTracker>(); AU.addRequired<DominatorTreeWrapperPass>(); + AU.addRequired<ParallelRegionInfoPass>(); AU.addPreserved<GlobalsAAWrapperPass>(); AU.setPreservesCFG(); } @@ -4286,5 +4299,6 @@ "Scalar Replacement Of Aggregates", false, false) INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker) INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) +INITIALIZE_PASS_DEPENDENCY(ParallelRegionInfoPass) INITIALIZE_PASS_END(SROALegacyPass, "sroa", "Scalar Replacement Of Aggregates", false, false) Index: lib/Transforms/Utils/Mem2Reg.cpp =================================================================== --- lib/Transforms/Utils/Mem2Reg.cpp +++ lib/Transforms/Utils/Mem2Reg.cpp @@ -15,6 +15,7 @@ #include "llvm/Transforms/Utils/Mem2Reg.h" #include "llvm/ADT/Statistic.h" #include "llvm/Analysis/AssumptionCache.h" +#include "llvm/Analysis/ParallelRegionInfo.h" #include "llvm/IR/Dominators.h" #include "llvm/IR/Function.h" #include 
"llvm/IR/Instructions.h" @@ -28,7 +29,8 @@ STATISTIC(NumPromoted, "Number of alloca's promoted"); static bool promoteMemoryToRegister(Function &F, DominatorTree &DT, - AssumptionCache &AC) { + AssumptionCache &AC, + ParallelRegionInfo &PRI) { std::vector<AllocaInst *> Allocas; BasicBlock &BB = F.getEntryBlock(); // Get the entry node for the function bool Changed = false; @@ -40,7 +42,7 @@ // the entry node for (BasicBlock::iterator I = BB.begin(), E = --BB.end(); I != E; ++I) if (AllocaInst *AI = dyn_cast<AllocaInst>(I)) // Is it an alloca? - if (isAllocaPromotable(AI)) + if (isAllocaPromotable(AI, &DT, &PRI)) Allocas.push_back(AI); if (Allocas.empty()) @@ -56,7 +58,8 @@ PreservedAnalyses PromotePass::run(Function &F, FunctionAnalysisManager &AM) { auto &DT = AM.getResult<DominatorTreeAnalysis>(F); auto &AC = AM.getResult<AssumptionAnalysis>(F); - if (!promoteMemoryToRegister(F, DT, AC)) + auto &PRI = AM.getResult<ParallelRegionAnalysis>(F); + if (!promoteMemoryToRegister(F, DT, AC, PRI)) return PreservedAnalyses::all(); PreservedAnalyses PA; @@ -81,12 +84,15 @@ DominatorTree &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree(); AssumptionCache &AC = getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F); - return promoteMemoryToRegister(F, DT, AC); + ParallelRegionInfo &PRI = + getAnalysis<ParallelRegionInfoPass>().getParallelRegionInfo(); + return promoteMemoryToRegister(F, DT, AC, PRI); } void getAnalysisUsage(AnalysisUsage &AU) const override { AU.addRequired<AssumptionCacheTracker>(); AU.addRequired<DominatorTreeWrapperPass>(); + AU.addRequired<ParallelRegionInfoPass>(); AU.setPreservesCFG(); } }; @@ -98,6 +104,7 @@ false, false) INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker) INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) +INITIALIZE_PASS_DEPENDENCY(ParallelRegionInfoPass) INITIALIZE_PASS_END(PromoteLegacyPass, "mem2reg", "Promote Memory to Register", false, false) Index: lib/Transforms/Utils/PromoteMemoryToRegister.cpp =================================================================== --- lib/Transforms/Utils/PromoteMemoryToRegister.cpp +++ lib/Transforms/Utils/PromoteMemoryToRegister.cpp @@ -26,6 +26,7 @@ #include "llvm/Analysis/InstructionSimplify.h" #include 
"llvm/Analysis/IteratedDominanceFrontier.h" #include "llvm/Analysis/ValueTracking.h" +#include "llvm/Analysis/ParallelRegionInfo.h" #include "llvm/IR/CFG.h" #include "llvm/IR/Constants.h" #include "llvm/IR/DIBuilder.h" @@ -48,7 +49,13 @@ STATISTIC(NumDeadAlloca, "Number of dead alloca's removed"); STATISTIC(NumPHIInsert, "Number of PHI nodes inserted"); -bool llvm::isAllocaPromotable(const AllocaInst *AI) { +bool llvm::isAllocaPromotable(const AllocaInst *AI, const DominatorTree *DT, + const ParallelRegionInfo *PRI) { + // If the dominator tree and parallel region info were given we check if the + // promotion would break any parallel region requirements. + if (PRI && DT && !PRI->isSafeToPromote(*AI, *DT)) + return false; + // FIXME: If the memory unit is of pointer or integer type, we can permit // assignments to subsections of the memory unit. unsigned AS = AI->getType()->getAddressSpace(); Index: test/PIR/alloca_promotion.ll =================================================================== --- /dev/null +++ test/PIR/alloca_promotion.ll @@ -0,0 +1,77 @@ +; RUN: opt -mem2reg -sroa -S < %s | FileCheck %s + +declare void @foo(); +declare void @bar(i32); + +; Verify we do not promote allocas that are used inside and outside a +; parallel region. +define i32 @alloca_used_in_seq_and_par_code() { +entry: +; CHECK: %local_alloca = alloca i32 + %local_alloca = alloca i32 + fork label %forked, %cont + +forked: ; preds = %entry +; CHECK: store i32 0, i32* %local_alloca + store i32 0, i32* %local_alloca + call void @foo() + halt label %cont + +cont: ; preds = %entry, %forked + call void @foo() +; CHECK: store i32 1, i32* %local_alloca + store i32 1, i32* %local_alloca + join label %join + +join: ; preds = %cont +; CHECK: %val = load i32, i32* %local_alloca + %val = load i32, i32* %local_alloca + ret i32 %val +} + +; Verify we do not promote allocas even if they are used only inside a parallel +; region but defined outside. 
+define i32 @alloca_used_only_in_par_code() { +entry: +; CHECK: alloca i32 + %local_alloca = alloca i32 + fork label %forked, %cont + +forked: ; preds = %entry + store i32 0, i32* %local_alloca + call void @foo() + halt label %cont + +cont: ; preds = %entry, %forked + call void @foo() + store i32 1, i32* %local_alloca + %val = load i32, i32* %local_alloca + join label %join + +join: ; preds = %cont +; CHECK: ret i32 %val + ret i32 %val +} + +; Verify we do promote allocas that are used only outside a parallel region. +define i32 @alloca_used_only_in_seq_code() { +entry: +; CHECK-NOT: alloca i32 + %local_alloca = alloca i32 + store i32 0, i32* %local_alloca + fork label %forked, %cont + +forked: ; preds = %entry + call void @foo() + halt label %cont + +cont: ; preds = %entry, %forked + call void @foo() + %val = load i32, i32* %local_alloca + join label %join + +join: ; preds = %cont + store i32 1, i32* %local_alloca +; CHECK: ret i32 0 + ret i32 %val +} Index: test/PIR/lower.ll =================================================================== --- /dev/null +++ test/PIR/lower.ll @@ -0,0 +1,78 @@ +; RUN: opt -sequentialize-pir -S < %s | FileCheck %s + +declare void @foo(); + +define void @simple() { +entry: + fork label %forked, %cont +; CHECK: entry +; CHECK-NEXT: br label %forked + +forked: + call void @foo() + halt label %cont +; CHECK: forked +; CHECK-NEXT: call void @foo +; CHECK-NEXT: br label %cont + +cont: + call void @foo() + join label %join +; CHECK: cont +; CHECK-NEXT: call void @foo +; CHECK-NEXT: br label %join + +join: + ret void +} + + +define void @nested() { +entry: + fork label %forked.outer, %cont.outer +; CHECK: entry +; CHECK-NEXT: br label %forked.outer + +forked.outer: + call void @foo() + fork label %forked.inner, %cont.inner +; CHECK: forked.outer +; CHECK-NEXT: call void @foo +; CHECK-NEXT: br label %forked.inner + +forked.inner: + call void @foo() + br label %forked.inner.body +; CHECK: forked.inner +; CHECK-NEXT: call void @foo +; 
CHECK-NEXT: br label %forked.inner.body + +forked.inner.body: + call void @foo() + halt label %cont.inner +; CHECK: forked.inner.body +; CHECK-NEXT: call void @foo +; CHECK-NEXT: br label %cont.inner + +cont.inner: + call void @foo() + join label %forked.outer.end +; CHECK: cont.inner +; CHECK-NEXT: call void @foo +; CHECK-NEXT: br label %forked.outer.end + +forked.outer.end: + halt label %cont.outer +; CHECK: forked.outer.end +; CHECK-NEXT: br label %cont.outer + +cont.outer: + call void @foo() + join label %join +; CHECK: cont +; CHECK-NEXT: call void @foo +; CHECK-NEXT: br label %join + +join: + ret void +} Index: test/PIR/parallel_loops.ll =================================================================== --- /dev/null +++ test/PIR/parallel_loops.ll @@ -0,0 +1,288 @@ +; RUN: opt -sequentialize-pir -S < %s | FileCheck %s + +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" + +; void f0(int *A, int N) { +; int i = 0; +; #pragma parallel +; do { +; A[i] += 1; +; } while (i++ < N); +; } +; +; Function Attrs: noinline nounwind uwtable +define void @f0(i32* %A, i32 %N) #0 { +entry: + %tmp = sext i32 %N to i64 + br label %do.body + +do.body: ; preds = %do.cond, %entry + %indvars.iv = phi i64 [ %indvars.iv.next, %do.cond ], [ 0, %entry ] + br label %pr.begin + +pr.begin: + fork label %forked, %do.cond + +forked: + %arrayidx = getelementptr inbounds i32, i32* %A, i64 %indvars.iv + %tmp1 = load i32, i32* %arrayidx, align 4 + %add = add nsw i32 %tmp1, 1 + store i32 %add, i32* %arrayidx, align 4 + halt label %do.cond + +; CHECK: forked: +; CHECK: %tmp1 = load i32, i32* %arrayidx, align 4, !llvm.mem.parallel_loop_access ![[PID0:[0-9]*]] +; CHECK: store i32 %add, i32* %arrayidx, align 4, !llvm.mem.parallel_loop_access ![[PID0]] +; CHECK: br label %do.cond + +do.cond: ; preds = %do.body + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %cmp = icmp slt i64 %indvars.iv, %tmp + br i1 %cmp, label %do.body, label %do.end + +; CHECK: do.cond: +; CHECK: br i1 %cmp, 
label %do.body, label %do.end, !llvm.loop ![[L0:[0-9]*]] + +do.end: ; preds = %do.cond + join label %return + +return: + ret void +} + +; void f1(int *A, int N) { +; int i = N; +; #pragma parallel +; do { +; A[i] += 1; +; } while (i-- >= 0); +; } +; +; Function Attrs: noinline nounwind uwtable +define void @f1(i32* %A, i32 %N) #0 { +entry: + %tmp = sext i32 %N to i64 + br label %do.body + +do.body: ; preds = %do.cond, %entry + %indvars.iv = phi i64 [ %indvars.iv.next, %do.cond ], [ %tmp, %entry ] + br label %pr.begin + +pr.begin: + fork label %forked, %do.cond + +forked: + %arrayidx = getelementptr inbounds i32, i32* %A, i64 %indvars.iv + %tmp1 = load i32, i32* %arrayidx, align 4 + %add = add nsw i32 %tmp1, 1 + store i32 %add, i32* %arrayidx, align 4 + halt label %do.cond + +; CHECK: forked: +; CHECK: %tmp1 = load i32, i32* %arrayidx, align 4, !llvm.mem.parallel_loop_access ![[PID1:[0-9]*]] +; CHECK: store i32 %add, i32* %arrayidx, align 4, !llvm.mem.parallel_loop_access ![[PID1]] +; CHECK: br label %do.cond + +do.cond: ; preds = %do.body + %indvars.iv.next = add nsw i64 %indvars.iv, -1 + %cmp = icmp sgt i64 %indvars.iv, -1 + +; CHECK: do.cond: +; CHECK: br i1 %cmp, label %do.body, label %do.end, !llvm.loop ![[L1:[0-9]*]] + + br i1 %cmp, label %do.body, label %do.end + +do.end: ; preds = %do.cond + join label %return + +return: + ret void +} + +; void f2(int *A, int N) { +; int i = 0; +; #pragma parallel +; do { +; A[i] += 1; +; } while (i++ < N); +; } +; +; Function Attrs: noinline nounwind uwtable +define void @f2(i32* %A, i32 %N) #0 { +entry: + %tmp = sext i32 %N to i64 + br label %do.body + +do.body: ; preds = %do.cond, %entry + %indvars.iv = phi i64 [ %indvars.iv.next, %do.cond ], [ 0, %entry ] + br label %pr.begin + +pr.begin: ; preds = %do.body + fork label %forked, %do.cond + +forked: ; preds = %pr.begin + %arrayidx = getelementptr inbounds i32, i32* %A, i64 %indvars.iv + %tmp1 = load i32, i32* %arrayidx, align 4, !llvm.mem.parallel_loop_access !0 + %add = 
add nsw i32 %tmp1, 1 + store i32 %add, i32* %arrayidx, align 4, !llvm.mem.parallel_loop_access !0 + halt label %do.cond + +; CHECK: forked: +; CHECK: %tmp1 = load i32, i32* %arrayidx, align 4, !llvm.mem.parallel_loop_access ![[PID2:[0-9]*]] +; CHECK: store i32 %add, i32* %arrayidx, align 4, !llvm.mem.parallel_loop_access ![[PID2]] +; CHECK: br label %do.cond + +do.cond: ; preds = %forked + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %cmp = icmp slt i64 %indvars.iv, %tmp + br i1 %cmp, label %do.body, label %do.end, !llvm.loop !1 + +; CHECK: do.cond: +; CHECK: br i1 %cmp, label %do.body, label %do.end, !llvm.loop ![[L2:[0-9]*]] + +do.end: ; preds = %do.cond + join label %return + +return: ; preds = %do.end + ret void +} + +; void f3(int *A, int N) { +; int i = N; +; #pragma parallel +; do { +; A[i] += 1; +; } while (i-- >= 0); +; } +; +; CHECK: @f3 +; +; Function Attrs: noinline nounwind uwtable +define void @f3(i32* %A, i32 %N) #0 { +entry: + %tmp = sext i32 %N to i64 + br label %do.body + +do.body: ; preds = %do.cond, %entry + %indvars.iv = phi i64 [ %indvars.iv.next, %do.cond ], [ %tmp, %entry ] + br label %pr.begin + +pr.begin: ; preds = %do.body + fork label %forked, %do.cond + +; CHECK: pr.begin: +; CHECK-NEXT: br label %forked + +forked: ; preds = %pr.begin + %arrayidx = getelementptr inbounds i32, i32* %A, i64 %indvars.iv + %tmp1 = load i32, i32* %arrayidx, align 4, !llvm.mem.parallel_loop_access !2 + %add = add nsw i32 %tmp1, 1 + store i32 %add, i32* %arrayidx, align 4, !llvm.mem.parallel_loop_access !2 + halt label %do.cond + +; CHECK: forked: +; CHECK: %tmp1 = load i32, i32* %arrayidx, align 4, !llvm.mem.parallel_loop_access ![[PID3:[0-9]*]] +; CHECK: store i32 %add, i32* %arrayidx, align 4, !llvm.mem.parallel_loop_access ![[PID3]] +; CHECK: br label %do.cond + +do.cond: ; preds = %forked + %indvars.iv.next = add nsw i64 %indvars.iv, -1 + %cmp = icmp sgt i64 %indvars.iv, -1 + br i1 %cmp, label %do.body, label %do.end, !llvm.loop !3 + +; CHECK: 
do.cond: +; CHECK: br i1 %cmp, label %do.body, label %do.end, !llvm.loop ![[L3:[0-9]*]] + +do.end: ; preds = %do.cond + join label %return + +return: ; preds = %do.end + ret void +} + +; void f4(int *A, int N) { +; int i = N; +; do { +; #pragma parallel +; do { +; A[i] += 1; +; } while (i-- >= 0); +; } while (0); +; } +; +; Function Attrs: noinline nounwind uwtable +; +; CHECK: @f4 +define void @f4(i32* %A, i32 %N) #0 { +entry: + %tmp = sext i32 %N to i64 + br label %outer.do.body + +outer.do.body: + br label %do.body + +do.body: ; preds = %do.cond, %outer.do.body + %indvars.iv = phi i64 [ %indvars.iv.next, %do.cond ], [ %tmp, %outer.do.body ] + br label %pr.begin + +pr.begin: ; preds = %do.body + fork label %forked, %do.cond + +forked: ; preds = %pr.begin + %arrayidx = getelementptr inbounds i32, i32* %A, i64 %indvars.iv + %tmp1 = load i32, i32* %arrayidx, align 4, !llvm.mem.parallel_loop_access !4 + %add = add nsw i32 %tmp1, 1 + store i32 %add, i32* %arrayidx, align 4, !llvm.mem.parallel_loop_access !4 + halt label %do.cond + +; CHECK: %tmp1 = load i32, i32* %arrayidx, align 4, !llvm.mem.parallel_loop_access ![[PID4:[0-9]*]] +; CHECK: store i32 %add, i32* %arrayidx, align 4, !llvm.mem.parallel_loop_access ![[PID4]] + +do.cond: ; preds = %forked + %indvars.iv.next = add nsw i64 %indvars.iv, -1 + %cmp = icmp sgt i64 %indvars.iv, -1 + br i1 %cmp, label %do.body, label %do.end + +; CHECK: do.cond +; CHECK: br i1 %cmp, label %do.body, label %do.end, !llvm.loop ![[LInner4:[0-9]*]] + +do.end: ; preds = %do.cond + join label %outer.do.cond + +outer.do.cond: + br i1 false, label %outer.do.body, label %outer.do.end, !llvm.loop !5 + +; CHECK: outer.do.cond +; CHECK: br i1 false, label %outer.do.body, label %outer.do.end, !llvm.loop ![[LOuter4:[0-9]*]] + +outer.do.end: + br label %return + +return: ; preds = %outer.do.end + ret void +} + +!0 = !{!1} +!1 = distinct !{!1} +!2 = !{!3} +!3 = distinct !{!3} +!4 = !{!5} +!5 = distinct !{!5} + +; CHECK: ![[PID0]] = !{![[L0]]} +; CHECK: ![[L0]] = 
distinct !{![[L0]]} + +; CHECK: ![[PID1]] = !{![[L1]]} +; CHECK: ![[L1]] = distinct !{![[L1]]} + +; CHECK: ![[PID2]] = !{![[L2]]} +; CHECK: ![[L2]] = distinct !{![[L2]]} + +; CHECK: ![[PID3]] = !{![[L3]]} +; CHECK: ![[L3]] = distinct !{![[L3]]} + +; CHECK: ![[PID4]] = !{![[LOuter4]], ![[LInner4]]} +; CHECK: ![[LOuter4]] = distinct !{![[LOuter4]]} +; CHECK: ![[LInner4]] = distinct !{![[LInner4]]} + +attributes #0 = { noinline nounwind uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" } Index: tools/opt/CMakeLists.txt =================================================================== --- tools/opt/CMakeLists.txt +++ tools/opt/CMakeLists.txt @@ -6,6 +6,7 @@ Core Coroutines IPO + PIR IRReader InstCombine Instrumentation Index: tools/opt/opt.cpp =================================================================== --- tools/opt/opt.cpp +++ tools/opt/opt.cpp @@ -378,6 +378,7 @@ initializeInstCombine(Registry); initializeInstrumentation(Registry); initializeTarget(Registry); + initializePIROpts(Registry); // For codegen passes, only passes that do IR to IR transformation are // supported. initializeCodeGenPreparePass(Registry);