diff --git a/llvm/lib/Transforms/IPO/OpenMPOpt.cpp b/llvm/lib/Transforms/IPO/OpenMPOpt.cpp --- a/llvm/lib/Transforms/IPO/OpenMPOpt.cpp +++ b/llvm/lib/Transforms/IPO/OpenMPOpt.cpp @@ -19,12 +19,14 @@ #include "llvm/Analysis/CallGraph.h" #include "llvm/Analysis/CallGraphSCCPass.h" #include "llvm/Analysis/OptimizationRemarkEmitter.h" +#include "llvm/Analysis/ValueTracking.h" #include "llvm/Frontend/OpenMP/OMPConstants.h" #include "llvm/Frontend/OpenMP/OMPIRBuilder.h" #include "llvm/InitializePasses.h" #include "llvm/Support/CommandLine.h" #include "llvm/Transforms/IPO.h" #include "llvm/Transforms/IPO/Attributor.h" +#include "llvm/Transforms/Utils/BasicBlockUtils.h" #include "llvm/Transforms/Utils/CallGraphUpdater.h" using namespace llvm; @@ -55,6 +57,8 @@ STATISTIC( NumOpenMPParallelRegionsReplacedInGPUStateMachine, "Number of OpenMP parallel regions replaced with ID in GPU state machines"); +STATISTIC(NumOpenMPParallelRegionsMerged, + "Number of OpenMP parallel regions merged"); #if !defined(NDEBUG) static constexpr auto TAG = "[" DEBUG_TYPE "]"; @@ -506,8 +510,9 @@ // Recollect uses, in case Attributor deleted any. OMPInfoCache.recollectUses(); - Changed |= deduplicateRuntimeCalls(); Changed |= deleteParallelRegions(); + Changed |= mergeParallelRegions(); + Changed |= deduplicateRuntimeCalls(); return Changed; } @@ -571,6 +576,209 @@ } private: + /// Merge parallel regions when it is safe. + bool mergeParallelRegions() { + const unsigned CallbackCalleeOperand = 2; + const unsigned CallbackFirstArgOperand = 3; + using InsertPointTy = OpenMPIRBuilder::InsertPointTy; + + // Check if there are any __kmpc_fork_call calls to merge. 
+    OMPInformationCache::RuntimeFunctionInfo &RFI =
+        OMPInfoCache.RFIs[OMPRTL___kmpc_fork_call];
+
+    if (!RFI.Declaration)
+      return false;
+
+    bool Changed = false;
+    LoopInfo *LI = nullptr;
+    DominatorTree *DT = nullptr;
+
+    // Map from a basic block to the __kmpc_fork_call instructions it contains.
+    SmallDenseMap<BasicBlock *, SmallPtrSet<Instruction *, 4>> BB2PRMap;
+
+    // Blocks delimiting the region to outline; set by the Merge helper and
+    // read by the OpenMPIRBuilder body-generation callback below.
+    BasicBlock *StartBB = nullptr, *EndBB = nullptr;
+    auto BodyGenCB = [&](InsertPointTy AllocaIP, InsertPointTy CodeGenIP,
+                         BasicBlock &ContinuationIP) {
+      BasicBlock *CGStartBB = CodeGenIP.getBlock();
+      BasicBlock *CGEndBB =
+          SplitBlock(CGStartBB, &*CodeGenIP.getPoint(), DT, LI);
+      assert(StartBB != nullptr && "StartBB should not be null");
+      CGStartBB->getTerminator()->setSuccessor(0, StartBB);
+      assert(EndBB != nullptr && "EndBB should not be null");
+      EndBB->getTerminator()->setSuccessor(0, CGEndBB);
+    };
+
+    auto PrivCB = [&](InsertPointTy AllocaIP, InsertPointTy CodeGenIP,
+                      Value &VPtr, Value *&ReplacementValue) -> InsertPointTy {
+      // No privatization is necessary when merging; values live across the
+      // original fork calls can be used directly.
+      ReplacementValue = &VPtr;
+      return CodeGenIP;
+    };
+
+    auto FiniCB = [&](InsertPointTy CodeGenIP) {};
+
+    // Helper to merge the __kmpc_fork_call calls in MergableCIs. They are all
+    // contained in BB and only separated by instructions that can be
+    // redundantly executed in parallel. The block BB is split before the first
+    // call (in MergableCIs) and after the last so the entire region we merge
+    // into a single parallel region is contained in a single basic block
+    // without any other instructions. We use the OpenMPIRBuilder to outline
+    // that block and call the resulting function via __kmpc_fork_call.
+    auto Merge = [&](SmallVectorImpl<CallInst *> &MergableCIs,
+                     BasicBlock *BB) {
+      // TODO: Change the interface to allow single CIs expanded, e.g, to
+      // include an outer loop.
+      assert(MergableCIs.size() > 1 && "Assumed multiple mergable CIs");
+
+      Function *OriginalFn = BB->getParent();
+      LLVM_DEBUG(dbgs() << TAG << "Merge " << MergableCIs.size()
+                        << " parallel regions in " << OriginalFn->getName()
+                        << "\n");
+
+      // Isolate the calls to merge in a separate block.
+      EndBB = SplitBlock(BB, MergableCIs.back()->getNextNode(), DT, LI);
+      BasicBlock *AfterBB =
+          SplitBlock(EndBB, &*EndBB->getFirstInsertionPt(), DT, LI);
+      StartBB =
+          SplitBlock(BB, MergableCIs.front(), DT, LI, nullptr, "expnd_omp_pr");
+
+      assert(BB->getUniqueSuccessor() == StartBB && "Expected a different CFG");
+      const DebugLoc DL = BB->getTerminator()->getDebugLoc();
+      BB->getTerminator()->eraseFromParent();
+
+      OpenMPIRBuilder::LocationDescription Loc(InsertPointTy(BB, BB->end()),
+                                               DL);
+      // TODO: Verify proc bind matches and use the value.
+      // TODO: Check the cancellable flag.
+      InsertPointTy AfterIP = OMPInfoCache.OMPBuilder.CreateParallel(
+          Loc, BodyGenCB, PrivCB, FiniCB, nullptr, nullptr,
+          OMP_PROC_BIND_default, /* IsCancellable */ false);
+      BranchInst::Create(AfterBB, AfterIP.getBlock());
+
+      // Perform the actual outlining.
+      OMPInfoCache.OMPBuilder.finalize();
+
+      // The merged region, including the original fork calls, now lives in
+      // the outlined function, hence getCaller() yields the outlined one.
+      Function *OutlinedFn = MergableCIs.front()->getCaller();
+
+      // Replace the __kmpc_fork_call calls with direct calls to the outlined
+      // callbacks.
+      SmallVector<Value *, 8> Args;
+      for (auto *CI : MergableCIs) {
+        Value *Callee =
+            CI->getArgOperand(CallbackCalleeOperand)->stripPointerCasts();
+        FunctionType *FT =
+            cast<FunctionType>(Callee->getType()->getPointerElementType());
+        Args.clear();
+        Args.push_back(OutlinedFn->getArg(0));
+        Args.push_back(OutlinedFn->getArg(1));
+        for (unsigned u = CallbackFirstArgOperand, e = CI->getNumArgOperands();
+             u < e; ++u)
+          Args.push_back(CI->getArgOperand(u));
+
+        CallInst *NewCI = CallInst::Create(FT, Callee, Args, "", CI);
+        if (CI->getDebugLoc())
+          NewCI->setDebugLoc(CI->getDebugLoc());
+
+        // Forward parameter attributes from the fork call to the direct call.
+        for (const Attribute &A :
+             OutlinedFn->getAttributes().getParamAttributes(0))
+          NewCI->addParamAttr(0, A);
+        for (const Attribute &A :
+             OutlinedFn->getAttributes().getParamAttributes(1))
+          NewCI->addParamAttr(1, A);
+        // Payload operand u of the fork call becomes parameter u - 1 of the
+        // callback (global_tid and bound_tid occupy parameters 0 and 1).
+        for (unsigned u = CallbackFirstArgOperand, e = CI->getNumArgOperands();
+             u < e; ++u)
+          for (const Attribute &A : CI->getAttributes().getParamAttributes(u))
+            NewCI->addParamAttr(u - 1, A);
+
+        // Emit an explicit barrier to replace the implicit fork-join barrier.
+        if (CI != MergableCIs.back()) {
+          // TODO: Remove barrier if the merged parallel region includes the
+          // 'nowait' clause.
+          OMPInfoCache.OMPBuilder.CreateBarrier(
+              InsertPointTy(NewCI->getParent(),
+                            NewCI->getNextNode()->getIterator()),
+              OMPD_parallel);
+          OMPInfoCache.OMPBuilder.finalize();
+        }
+
+        auto Remark = [&](OptimizationRemark OR) {
+          return OR << "Parallel region in "
+                    << ore::NV("OpenMPParallelMerge",
+                               CI->getCaller()->getName())
+                    << " merged";
+        };
+        emitRemark<OptimizationRemark>(CI, "OpenMPParallelRegionMerging",
+                                       Remark);
+
+        CI->eraseFromParent();
+      }
+
+      assert(OutlinedFn != OriginalFn && "Outlining failed");
+      CGUpdater.registerOutlinedFunction(*OutlinedFn);
+      CGUpdater.reanalyzeFunction(*OriginalFn);
+
+      NumOpenMPParallelRegionsMerged += MergableCIs.size();
+
+      return true;
+    };
+
+    // Helper function that identifies sequences of
+    // __kmpc_fork_call uses in a basic block.
+    auto DetectPRsCB = [&](Use &U, Function &F) {
+      CallInst *CI = getCallIfRegularCall(U, &RFI);
+      // Ignore uses that are not direct calls to the runtime function (e.g.,
+      // the callee taken as a function pointer); getCallIfRegularCall
+      // returns null for those.
+      if (!CI)
+        return false;
+      BB2PRMap[CI->getParent()].insert(CI);
+
+      LLVM_DEBUG(dbgs() << TAG << "Found parallel regions in "
+                        << BB2PRMap.size() << " blocks in " << F.getName()
+                        << "\n");
+
+      return false;
+    };
+
+    BB2PRMap.clear();
+    RFI.foreachUse(SCC, DetectPRsCB);
+    SmallVector<SmallVector<CallInst *, 4>, 4> MergableCIsVector;
+    // Find mergable parallel regions within a basic block that are
+    // safe to merge, that is any in-between instructions can safely
+    // execute in parallel after merging.
+    // TODO: support merging across basic-blocks.
+ for (auto &It : BB2PRMap) { + auto &CIs = It.getSecond(); + if (CIs.size() < 2) + continue; + + BasicBlock *BB = It.getFirst(); + SmallVector MergableCIs; + + // Find maximal number of parallel region CIs that are safe to merge. + for (Instruction &I : *BB) { + if (CIs.count(&I)) { + MergableCIs.push_back(cast(&I)); + continue; + } + + if (isSafeToSpeculativelyExecute(&I, &I, DT)) + continue; + + if (MergableCIs.size() > 1) + MergableCIsVector.push_back(MergableCIs); + + MergableCIs.clear(); + } + + if (!MergableCIsVector.empty()) { + Changed = true; + + for (auto &MergableCIs : MergableCIsVector) + Merge(MergableCIs, BB); + } + } + + if (Changed) { + // Update RFI info to set it up for later passes. + RFI.clearUsesMap(); + OMPInfoCache.collectUses(RFI, /* CollectStats */ false); + } + + return Changed; + } + /// Try to delete parallel regions if possible. bool deleteParallelRegions() { const unsigned CallbackCalleeOperand = 2; diff --git a/llvm/test/Transforms/OpenMP/parallel_deletion.ll b/llvm/test/Transforms/OpenMP/parallel_deletion.ll --- a/llvm/test/Transforms/OpenMP/parallel_deletion.ll +++ b/llvm/test/Transforms/OpenMP/parallel_deletion.ll @@ -54,7 +54,7 @@ ; CHECK-LABEL: define {{[^@]+}}@.omp_outlined.willreturn.0 ; CHECK-SAME: (i32* noalias nocapture nofree readnone [[DOTGLOBAL_TID_:%.*]], i32* noalias nocapture nofree readnone [[DOTBOUND_TID_:%.*]]) #1 ; CHECK-NEXT: entry: -; CHECK-NEXT: call void @readonly() #4 +; CHECK-NEXT: call void @readonly() #5 ; CHECK-NEXT: ret void ; entry: @@ -99,9 +99,16 @@ define void @delete_parallel_1() { ; CHECK-LABEL: define {{[^@]+}}@delete_parallel_1() ; CHECK-NEXT: entry: -; CHECK-NEXT: call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* nonnull align 8 dereferenceable(24) @0, i32 0, void (i32*, i32*, ...)* nonnull bitcast (void (i32*, i32*)* @.omp_outlined. to void (i32*, i32*, ...)*)) -; CHECK-NEXT: call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) 
@__kmpc_fork_call(%struct.ident_t* nonnull align 8 dereferenceable(24) @0, i32 0, void (i32*, i32*, ...)* nonnull bitcast (void (i32*, i32*)* @.omp_outlined..0 to void (i32*, i32*, ...)*)) -; CHECK-NEXT: call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* nonnull align 8 dereferenceable(24) @0, i32 0, void (i32*, i32*, ...)* nonnull bitcast (void (i32*, i32*)* @.omp_outlined..1 to void (i32*, i32*, ...)*)) +; CHECK-NEXT: [[OMP_GLOBAL_THREAD_NUM:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @0) +; CHECK-NEXT: br label [[OMP_PARALLEL:%.*]] +; CHECK: omp_parallel: +; CHECK-NEXT: call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* @0, i32 0, void (i32*, i32*, ...)* bitcast (void (i32*, i32*)* @delete_parallel_1..omp_par to void (i32*, i32*, ...)*)) +; CHECK-NEXT: br label [[OMP_PAR_OUTLINED_EXIT:%.*]] +; CHECK: omp.par.outlined.exit: +; CHECK-NEXT: br label [[OMP_PAR_EXIT_SPLIT:%.*]] +; CHECK: omp.par.exit.split: +; CHECK-NEXT: br label [[ENTRY_SPLIT_SPLIT:%.*]] +; CHECK: entry.split.split: ; CHECK-NEXT: ret void ; entry: @@ -126,9 +133,9 @@ define internal void @.omp_outlined..0(i32* noalias %.global_tid., i32* noalias %.bound_tid.) { ; CHECK-LABEL: define {{[^@]+}}@.omp_outlined..0 -; CHECK-SAME: (i32* noalias nocapture nofree readnone [[DOTGLOBAL_TID_:%.*]], i32* noalias nocapture nofree readnone [[DOTBOUND_TID_:%.*]]) #4 +; CHECK-SAME: (i32* noalias nocapture nofree readnone [[DOTGLOBAL_TID_:%.*]], i32* noalias nocapture nofree readnone [[DOTBOUND_TID_:%.*]]) #5 ; CHECK-NEXT: entry: -; CHECK-NEXT: call void @readonly() #4 +; CHECK-NEXT: call void @readonly() #5 ; CHECK-NEXT: ret void ; entry: @@ -138,7 +145,7 @@ define internal void @.omp_outlined..1(i32* noalias %.global_tid., i32* noalias %.bound_tid.) 
{ ; CHECK-LABEL: define {{[^@]+}}@.omp_outlined..1 -; CHECK-SAME: (i32* noalias nocapture nofree readnone [[DOTGLOBAL_TID_:%.*]], i32* noalias nocapture nofree readnone [[DOTBOUND_TID_:%.*]]) #5 +; CHECK-SAME: (i32* noalias nocapture nofree readnone [[DOTGLOBAL_TID_:%.*]], i32* noalias nocapture nofree readnone [[DOTBOUND_TID_:%.*]]) #6 ; CHECK-NEXT: entry: ; CHECK-NEXT: call void @readnone() ; CHECK-NEXT: ret void @@ -190,10 +197,16 @@ ; CHECK-NEXT: [[TMP:%.*]] = bitcast i32* [[A]] to i8* ; CHECK-NEXT: call void @llvm.lifetime.start.p0i8(i64 4, i8* nonnull align 4 dereferenceable(4) [[TMP]]) #0 ; CHECK-NEXT: store i32 0, i32* [[A]], align 4 -; CHECK-NEXT: call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* nonnull align 8 dereferenceable(24) @0, i32 1, void (i32*, i32*, ...)* nonnull bitcast (void (i32*, i32*, i32*)* @.omp_outlined..3 to void (i32*, i32*, ...)*), i32* nocapture nofree nonnull align 4 dereferenceable(4) [[A]]) -; CHECK-NEXT: call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* nonnull align 8 dereferenceable(24) @0, i32 1, void (i32*, i32*, ...)* nonnull bitcast (void (i32*, i32*, i32*)* @.omp_outlined..4 to void (i32*, i32*, ...)*), i32* nocapture nonnull align 4 dereferenceable(4) [[A]]) -; CHECK-NEXT: call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* nonnull align 8 dereferenceable(24) @0, i32 1, void (i32*, i32*, ...)* nonnull bitcast (void (i32*, i32*, i32*)* @.omp_outlined..5 to void (i32*, i32*, ...)*), i32* nocapture nonnull align 4 dereferenceable(4) [[A]]) -; CHECK-NEXT: call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) 
@__kmpc_fork_call(%struct.ident_t* nonnull align 8 dereferenceable(24) @0, i32 1, void (i32*, i32*, ...)* nonnull bitcast (void (i32*, i32*, i32*)* @.omp_outlined..6 to void (i32*, i32*, ...)*), i32* nocapture nonnull align 4 dereferenceable(4) [[A]]) +; CHECK-NEXT: [[OMP_GLOBAL_THREAD_NUM:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @0) +; CHECK-NEXT: br label [[OMP_PARALLEL:%.*]] +; CHECK: omp_parallel: +; CHECK-NEXT: call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* @0, i32 1, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, i32*)* @delete_parallel_2..omp_par to void (i32*, i32*, ...)*), i32* [[A]]) +; CHECK-NEXT: br label [[OMP_PAR_OUTLINED_EXIT:%.*]] +; CHECK: omp.par.outlined.exit: +; CHECK-NEXT: br label [[OMP_PAR_EXIT_SPLIT:%.*]] +; CHECK: omp.par.exit.split: +; CHECK-NEXT: br label [[ENTRY_SPLIT_SPLIT:%.*]] +; CHECK: entry.split.split: ; CHECK-NEXT: [[TMP1:%.*]] = bitcast i32* [[A]] to i8* ; CHECK-NEXT: call void @llvm.lifetime.end.p0i8(i64 4, i8* nonnull [[TMP1]]) ; CHECK-NEXT: ret void @@ -214,9 +227,9 @@ define internal void @.omp_outlined..3(i32* noalias %.global_tid., i32* noalias %.bound_tid., i32* dereferenceable(4) %a) { ; CHECK-LABEL: define {{[^@]+}}@.omp_outlined..3 -; CHECK-SAME: (i32* noalias nocapture nofree readnone [[DOTGLOBAL_TID_:%.*]], i32* noalias nocapture nofree readnone [[DOTBOUND_TID_:%.*]], i32* nocapture nofree nonnull align 4 dereferenceable(4) [[A:%.*]]) #6 +; CHECK-SAME: (i32* noalias nocapture nofree readnone [[DOTGLOBAL_TID_:%.*]], i32* noalias nocapture nofree readnone [[DOTBOUND_TID_:%.*]], i32* nocapture nofree nonnull align 4 dereferenceable(4) [[A:%.*]]) #7 ; CHECK-NEXT: entry: -; CHECK-NEXT: [[CALL:%.*]] = call i32 @omp_get_thread_num() #4 +; CHECK-NEXT: [[CALL:%.*]] = call i32 @omp_get_thread_num() #5 ; CHECK-NEXT: [[CMP:%.*]] = icmp eq i32 [[CALL]], 0 ; CHECK-NEXT: br i1 [[CMP]], label [[IF_THEN:%.*]], label [[IF_END:%.*]] ; CHECK: if.then: @@ -393,7 
+406,7 @@ define internal void @.omp.reduction.reduction_func(i8* %arg, i8* %arg1) { ; CHECK-LABEL: define {{[^@]+}}@.omp.reduction.reduction_func -; CHECK-SAME: (i8* nocapture nofree nonnull readonly align 8 dereferenceable(8) [[ARG:%.*]], i8* nocapture nofree nonnull readonly align 8 dereferenceable(8) [[ARG1:%.*]]) #10 +; CHECK-SAME: (i8* nocapture nofree nonnull readonly align 8 dereferenceable(8) [[ARG:%.*]], i8* nocapture nofree nonnull readonly align 8 dereferenceable(8) [[ARG1:%.*]]) #11 ; CHECK-NEXT: entry: ; CHECK-NEXT: [[TMP:%.*]] = bitcast i8* [[ARG1]] to i32** ; CHECK-NEXT: [[TMP2:%.*]] = load i32*, i32** [[TMP]], align 8 diff --git a/llvm/test/Transforms/OpenMP/parallel_region_merging.ll b/llvm/test/Transforms/OpenMP/parallel_region_merging.ll new file mode 100644 --- /dev/null +++ b/llvm/test/Transforms/OpenMP/parallel_region_merging.ll @@ -0,0 +1,318 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --function-signature +; RUN: opt -S -attributor -openmpopt < %s | FileCheck %s +; RUN: opt -S -passes='attributor,cgscc(openmpopt)' < %s | FileCheck %s +; +target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128" + +%struct.ident_t = type { i32, i32, i32, i32, i8* } + +@.str = private unnamed_addr constant [23 x i8] c";unknown;unknown;0;0;;\00", align 1 +@0 = private unnamed_addr global %struct.ident_t { i32 0, i32 2, i32 0, i32 0, i8* getelementptr inbounds ([23 x i8], [23 x i8]* @.str, i32 0, i32 0) }, align 8 + +; void merge_all() { +; int a = 1; +; #pragma omp parallel +; { +; a = 2; +; } +; #pragma omp parallel +; { +; a = 3; +; } +; } +; +; Merge all parallel regions +define dso_local void @merge_all() #0 { +; CHECK-LABEL: define {{[^@]+}}@merge_all() +; CHECK-NEXT: entry: +; CHECK-NEXT: [[A:%.*]] = alloca i32, align 4 +; CHECK-NEXT: store i32 1, i32* [[A]], align 4 +; CHECK-NEXT: [[OMP_GLOBAL_THREAD_NUM:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @0) +; 
CHECK-NEXT: br label [[OMP_PARALLEL:%.*]] +; CHECK: omp_parallel: +; CHECK-NEXT: call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* @0, i32 1, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, i32*)* @merge_all..omp_par to void (i32*, i32*, ...)*), i32* [[A]]) +; CHECK-NEXT: br label [[OMP_PAR_OUTLINED_EXIT:%.*]] +; CHECK: omp.par.outlined.exit: +; CHECK-NEXT: br label [[OMP_PAR_EXIT_SPLIT:%.*]] +; CHECK: omp.par.exit.split: +; CHECK-NEXT: br label [[ENTRY_SPLIT_SPLIT:%.*]] +; CHECK: entry.split.split: +; CHECK-NEXT: ret void +; +entry: + %a = alloca i32, align 4 + store i32 1, i32* %a, align 4 + call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* @0, i32 1, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, i32*)* @.omp_outlined. to void (i32*, i32*, ...)*), i32* %a) + call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* @0, i32 1, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, i32*)* @.omp_outlined..1 to void (i32*, i32*, ...)*), i32* %a) + ret void +} + +define internal void @.omp_outlined.(i32* noalias %.global_tid., i32* noalias %.bound_tid., i32* nonnull align 4 dereferenceable(4) %a) { +; CHECK-LABEL: define {{[^@]+}}@.omp_outlined. 
+; CHECK-SAME: (i32* noalias nofree [[DOTGLOBAL_TID_:%.*]], i32* noalias nofree [[DOTBOUND_TID_:%.*]], i32* nofree nonnull align 4 dereferenceable(4) [[A:%.*]]) #1 +; CHECK-NEXT: entry: +; CHECK-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 +; CHECK-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 +; CHECK-NEXT: [[A_ADDR:%.*]] = alloca i32*, align 8 +; CHECK-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8 +; CHECK-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8 +; CHECK-NEXT: store i32* [[A]], i32** [[A_ADDR]], align 8 +; CHECK-NEXT: [[TMP0:%.*]] = load i32*, i32** [[A_ADDR]], align 8 +; CHECK-NEXT: store i32 2, i32* [[TMP0]], align 4 +; CHECK-NEXT: ret void +; +entry: + %.global_tid..addr = alloca i32*, align 8 + %.bound_tid..addr = alloca i32*, align 8 + %a.addr = alloca i32*, align 8 + store i32* %.global_tid., i32** %.global_tid..addr, align 8 + store i32* %.bound_tid., i32** %.bound_tid..addr, align 8 + store i32* %a, i32** %a.addr, align 8 + %0 = load i32*, i32** %a.addr, align 8 + store i32 2, i32* %0, align 4 + ret void +} + +declare !callback !0 void @__kmpc_fork_call(%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) 
+ +define internal void @.omp_outlined..1(i32* noalias %.global_tid., i32* noalias %.bound_tid., i32* nonnull align 4 dereferenceable(4) %a) { +; CHECK-LABEL: define {{[^@]+}}@.omp_outlined..1 +; CHECK-SAME: (i32* noalias nofree [[DOTGLOBAL_TID_:%.*]], i32* noalias nofree [[DOTBOUND_TID_:%.*]], i32* nofree nonnull align 4 dereferenceable(4) [[A:%.*]]) #1 +; CHECK-NEXT: entry: +; CHECK-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 +; CHECK-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 +; CHECK-NEXT: [[A_ADDR:%.*]] = alloca i32*, align 8 +; CHECK-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8 +; CHECK-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8 +; CHECK-NEXT: store i32* [[A]], i32** [[A_ADDR]], align 8 +; CHECK-NEXT: [[TMP0:%.*]] = load i32*, i32** [[A_ADDR]], align 8 +; CHECK-NEXT: store i32 3, i32* [[TMP0]], align 4 +; CHECK-NEXT: ret void +; +entry: + %.global_tid..addr = alloca i32*, align 8 + %.bound_tid..addr = alloca i32*, align 8 + %a.addr = alloca i32*, align 8 + store i32* %.global_tid., i32** %.global_tid..addr, align 8 + store i32* %.bound_tid., i32** %.bound_tid..addr, align 8 + store i32* %a, i32** %a.addr, align 8 + %0 = load i32*, i32** %a.addr, align 8 + store i32 3, i32* %0, align 4 + ret void +} + +; void merge_none() { +; int a = 1; +; #pragma omp parallel +; { +; a = 2; +; } +; a = 3; +; #pragma omp parallel +; { +; a = 4; +; } +; } +; +; Do not merge parallel regions, in-between store +; instruction is unsafe to execute in parallel +define dso_local void @merge_none() #0 { +; CHECK-LABEL: define {{[^@]+}}@merge_none() +; CHECK-NEXT: entry: +; CHECK-NEXT: [[A:%.*]] = alloca i32, align 4 +; CHECK-NEXT: store i32 1, i32* [[A]], align 4 +; CHECK-NEXT: call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) 
@__kmpc_fork_call(%struct.ident_t* nonnull align 8 dereferenceable(24) @0, i32 1, void (i32*, i32*, ...)* nonnull bitcast (void (i32*, i32*, i32*)* @.omp_outlined..2 to void (i32*, i32*, ...)*), i32* nofree nonnull align 4 dereferenceable(4) [[A]]) +; CHECK-NEXT: store i32 3, i32* [[A]], align 4 +; CHECK-NEXT: call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* nonnull align 8 dereferenceable(24) @0, i32 1, void (i32*, i32*, ...)* nonnull bitcast (void (i32*, i32*, i32*)* @.omp_outlined..3 to void (i32*, i32*, ...)*), i32* nofree nonnull align 4 dereferenceable(4) [[A]]) +; CHECK-NEXT: ret void +; +entry: + %a = alloca i32, align 4 + store i32 1, i32* %a, align 4 + call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* @0, i32 1, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, i32*)* @.omp_outlined..2 to void (i32*, i32*, ...)*), i32* %a) + store i32 3, i32* %a, align 4 + call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) 
@__kmpc_fork_call(%struct.ident_t* @0, i32 1, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, i32*)* @.omp_outlined..3 to void (i32*, i32*, ...)*), i32* %a) + ret void +} + +define internal void @.omp_outlined..2(i32* noalias %.global_tid., i32* noalias %.bound_tid., i32* nonnull align 4 dereferenceable(4) %a) { +; CHECK-LABEL: define {{[^@]+}}@.omp_outlined..2 +; CHECK-SAME: (i32* noalias nofree [[DOTGLOBAL_TID_:%.*]], i32* noalias nofree [[DOTBOUND_TID_:%.*]], i32* nofree nonnull align 4 dereferenceable(4) [[A:%.*]]) #1 +; CHECK-NEXT: entry: +; CHECK-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 +; CHECK-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 +; CHECK-NEXT: [[A_ADDR:%.*]] = alloca i32*, align 8 +; CHECK-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8 +; CHECK-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8 +; CHECK-NEXT: store i32* [[A]], i32** [[A_ADDR]], align 8 +; CHECK-NEXT: [[TMP0:%.*]] = load i32*, i32** [[A_ADDR]], align 8 +; CHECK-NEXT: store i32 2, i32* [[TMP0]], align 4 +; CHECK-NEXT: ret void +; +entry: + %.global_tid..addr = alloca i32*, align 8 + %.bound_tid..addr = alloca i32*, align 8 + %a.addr = alloca i32*, align 8 + store i32* %.global_tid., i32** %.global_tid..addr, align 8 + store i32* %.bound_tid., i32** %.bound_tid..addr, align 8 + store i32* %a, i32** %a.addr, align 8 + %0 = load i32*, i32** %a.addr, align 8 + store i32 2, i32* %0, align 4 + ret void +} + +define internal void @.omp_outlined..3(i32* noalias %.global_tid., i32* noalias %.bound_tid., i32* nonnull align 4 dereferenceable(4) %a) { +; CHECK-LABEL: define {{[^@]+}}@.omp_outlined..3 +; CHECK-SAME: (i32* noalias nofree [[DOTGLOBAL_TID_:%.*]], i32* noalias nofree [[DOTBOUND_TID_:%.*]], i32* nofree nonnull align 4 dereferenceable(4) [[A:%.*]]) #1 +; CHECK-NEXT: entry: +; CHECK-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 +; CHECK-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 +; 
CHECK-NEXT: [[A_ADDR:%.*]] = alloca i32*, align 8 +; CHECK-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8 +; CHECK-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8 +; CHECK-NEXT: store i32* [[A]], i32** [[A_ADDR]], align 8 +; CHECK-NEXT: [[TMP0:%.*]] = load i32*, i32** [[A_ADDR]], align 8 +; CHECK-NEXT: store i32 4, i32* [[TMP0]], align 4 +; CHECK-NEXT: ret void +; +entry: + %.global_tid..addr = alloca i32*, align 8 + %.bound_tid..addr = alloca i32*, align 8 + %a.addr = alloca i32*, align 8 + store i32* %.global_tid., i32** %.global_tid..addr, align 8 + store i32* %.bound_tid., i32** %.bound_tid..addr, align 8 + store i32* %a, i32** %a.addr, align 8 + %0 = load i32*, i32** %a.addr, align 8 + store i32 4, i32* %0, align 4 + ret void +} + +; void merge_some() { +; int a = 1; +; #pragma omp parallel +; { +; a = 2; +; } +; a = 3; +; #pragma omp parallel +; { +; a = 4; +; } +; #pragma omp parallel +; { +; a = 5; +; } +; } +; +; Do not merge first parallel region, due to the +; unsafe store, but merge the two next parallel +; regions +define dso_local void @merge_some() #0 { +; CHECK-LABEL: define {{[^@]+}}@merge_some() +; CHECK-NEXT: entry: +; CHECK-NEXT: [[A:%.*]] = alloca i32, align 4 +; CHECK-NEXT: store i32 1, i32* [[A]], align 4 +; CHECK-NEXT: call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* nonnull align 8 dereferenceable(24) @0, i32 1, void (i32*, i32*, ...)* nonnull bitcast (void (i32*, i32*, i32*)* @.omp_outlined..4 to void (i32*, i32*, ...)*), i32* nofree nonnull align 4 dereferenceable(4) [[A]]) +; CHECK-NEXT: store i32 3, i32* [[A]], align 4 +; CHECK-NEXT: [[OMP_GLOBAL_THREAD_NUM:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @0) +; CHECK-NEXT: br label [[OMP_PARALLEL:%.*]] +; CHECK: omp_parallel: +; CHECK-NEXT: call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) 
@__kmpc_fork_call(%struct.ident_t* @0, i32 1, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, i32*)* @merge_some..omp_par to void (i32*, i32*, ...)*), i32* [[A]]) +; CHECK-NEXT: br label [[OMP_PAR_OUTLINED_EXIT:%.*]] +; CHECK: omp.par.outlined.exit: +; CHECK-NEXT: br label [[OMP_PAR_EXIT_SPLIT:%.*]] +; CHECK: omp.par.exit.split: +; CHECK-NEXT: br label [[ENTRY_SPLIT_SPLIT:%.*]] +; CHECK: entry.split.split: +; CHECK-NEXT: ret void +; +entry: + %a = alloca i32, align 4 + store i32 1, i32* %a, align 4 + call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* @0, i32 1, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, i32*)* @.omp_outlined..4 to void (i32*, i32*, ...)*), i32* %a) + store i32 3, i32* %a, align 4 + call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* @0, i32 1, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, i32*)* @.omp_outlined..5 to void (i32*, i32*, ...)*), i32* %a) + call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) 
@__kmpc_fork_call(%struct.ident_t* @0, i32 1, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, i32*)* @.omp_outlined..6 to void (i32*, i32*, ...)*), i32* %a) + ret void +} + +define internal void @.omp_outlined..4(i32* noalias %.global_tid., i32* noalias %.bound_tid., i32* nonnull align 4 dereferenceable(4) %a) { +; CHECK-LABEL: define {{[^@]+}}@.omp_outlined..4 +; CHECK-SAME: (i32* noalias nofree [[DOTGLOBAL_TID_:%.*]], i32* noalias nofree [[DOTBOUND_TID_:%.*]], i32* nofree nonnull align 4 dereferenceable(4) [[A:%.*]]) #1 +; CHECK-NEXT: entry: +; CHECK-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 +; CHECK-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 +; CHECK-NEXT: [[A_ADDR:%.*]] = alloca i32*, align 8 +; CHECK-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8 +; CHECK-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8 +; CHECK-NEXT: store i32* [[A]], i32** [[A_ADDR]], align 8 +; CHECK-NEXT: [[TMP0:%.*]] = load i32*, i32** [[A_ADDR]], align 8 +; CHECK-NEXT: store i32 2, i32* [[TMP0]], align 4 +; CHECK-NEXT: ret void +; +entry: + %.global_tid..addr = alloca i32*, align 8 + %.bound_tid..addr = alloca i32*, align 8 + %a.addr = alloca i32*, align 8 + store i32* %.global_tid., i32** %.global_tid..addr, align 8 + store i32* %.bound_tid., i32** %.bound_tid..addr, align 8 + store i32* %a, i32** %a.addr, align 8 + %0 = load i32*, i32** %a.addr, align 8 + store i32 2, i32* %0, align 4 + ret void +} + +define internal void @.omp_outlined..5(i32* noalias %.global_tid., i32* noalias %.bound_tid., i32* nonnull align 4 dereferenceable(4) %a) { +; CHECK-LABEL: define {{[^@]+}}@.omp_outlined..5 +; CHECK-SAME: (i32* noalias nofree [[DOTGLOBAL_TID_:%.*]], i32* noalias nofree [[DOTBOUND_TID_:%.*]], i32* nofree nonnull align 4 dereferenceable(4) [[A:%.*]]) #1 +; CHECK-NEXT: entry: +; CHECK-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 +; CHECK-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 +; 
CHECK-NEXT: [[A_ADDR:%.*]] = alloca i32*, align 8 +; CHECK-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8 +; CHECK-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8 +; CHECK-NEXT: store i32* [[A]], i32** [[A_ADDR]], align 8 +; CHECK-NEXT: [[TMP0:%.*]] = load i32*, i32** [[A_ADDR]], align 8 +; CHECK-NEXT: store i32 4, i32* [[TMP0]], align 4 +; CHECK-NEXT: ret void +; +entry: + %.global_tid..addr = alloca i32*, align 8 + %.bound_tid..addr = alloca i32*, align 8 + %a.addr = alloca i32*, align 8 + store i32* %.global_tid., i32** %.global_tid..addr, align 8 + store i32* %.bound_tid., i32** %.bound_tid..addr, align 8 + store i32* %a, i32** %a.addr, align 8 + %0 = load i32*, i32** %a.addr, align 8 + store i32 4, i32* %0, align 4 + ret void +} + +define internal void @.omp_outlined..6(i32* noalias %.global_tid., i32* noalias %.bound_tid., i32* nonnull align 4 dereferenceable(4) %a) { +; CHECK-LABEL: define {{[^@]+}}@.omp_outlined..6 +; CHECK-SAME: (i32* noalias nofree [[DOTGLOBAL_TID_:%.*]], i32* noalias nofree [[DOTBOUND_TID_:%.*]], i32* nofree nonnull align 4 dereferenceable(4) [[A:%.*]]) #1 +; CHECK-NEXT: entry: +; CHECK-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 +; CHECK-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 +; CHECK-NEXT: [[A_ADDR:%.*]] = alloca i32*, align 8 +; CHECK-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8 +; CHECK-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8 +; CHECK-NEXT: store i32* [[A]], i32** [[A_ADDR]], align 8 +; CHECK-NEXT: [[TMP0:%.*]] = load i32*, i32** [[A_ADDR]], align 8 +; CHECK-NEXT: store i32 5, i32* [[TMP0]], align 4 +; CHECK-NEXT: ret void +; +entry: + %.global_tid..addr = alloca i32*, align 8 + %.bound_tid..addr = alloca i32*, align 8 + %a.addr = alloca i32*, align 8 + store i32* %.global_tid., i32** %.global_tid..addr, align 8 + store i32* %.bound_tid., i32** %.bound_tid..addr, align 8 + store i32* 
%a, i32** %a.addr, align 8 + %0 = load i32*, i32** %a.addr, align 8 + store i32 5, i32* %0, align 4 + ret void +} + +!0 = !{!1} +!1 = !{i64 2, i64 -1, i64 -1, i1 true}