diff --git a/llvm/lib/Transforms/IPO/OpenMPOpt.cpp b/llvm/lib/Transforms/IPO/OpenMPOpt.cpp
--- a/llvm/lib/Transforms/IPO/OpenMPOpt.cpp
+++ b/llvm/lib/Transforms/IPO/OpenMPOpt.cpp
@@ -19,12 +19,14 @@
 #include "llvm/Analysis/CallGraph.h"
 #include "llvm/Analysis/CallGraphSCCPass.h"
 #include "llvm/Analysis/OptimizationRemarkEmitter.h"
+#include "llvm/Analysis/ValueTracking.h"
 #include "llvm/Frontend/OpenMP/OMPConstants.h"
 #include "llvm/Frontend/OpenMP/OMPIRBuilder.h"
 #include "llvm/InitializePasses.h"
 #include "llvm/Support/CommandLine.h"
 #include "llvm/Transforms/IPO.h"
 #include "llvm/Transforms/IPO/Attributor.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
 #include "llvm/Transforms/Utils/CallGraphUpdater.h"
 
 using namespace llvm;
@@ -55,6 +57,8 @@
 STATISTIC(
     NumOpenMPParallelRegionsReplacedInGPUStateMachine,
     "Number of OpenMP parallel regions replaced with ID in GPU state machines");
+STATISTIC(NumOpenMPParallelRegionsMerged,
+          "Number of OpenMP parallel regions merged");
 
 #if !defined(NDEBUG)
 static constexpr auto TAG = "[" DEBUG_TYPE "]";
@@ -506,8 +510,9 @@
     // Recollect uses, in case Attributor deleted any.
     OMPInfoCache.recollectUses();
 
-    Changed |= deduplicateRuntimeCalls();
     Changed |= deleteParallelRegions();
+    Changed |= mergeParallelRegions();
+    Changed |= deduplicateRuntimeCalls();
 
     return Changed;
   }
@@ -571,6 +576,209 @@
   }
 
 private:
+  /// Merge parallel regions when it is safe.
+  bool mergeParallelRegions() {
+    const unsigned CallbackCalleeOperand = 2;
+    const unsigned CallbackFirstArgOperand = 3;
+    using InsertPointTy = OpenMPIRBuilder::InsertPointTy;
+
+    // Check if there are any __kmpc_fork_call calls to merge.
+    OMPInformationCache::RuntimeFunctionInfo &RFI =
+        OMPInfoCache.RFIs[OMPRTL___kmpc_fork_call];
+
+    if (!RFI.Declaration)
+      return false;
+
+    bool Changed = false;
+    LoopInfo *LI = nullptr;
+    DominatorTree *DT = nullptr;
+
+    SmallDenseMap<BasicBlock *, SmallPtrSet<Instruction *, 4>> BB2PRMap;
+
+    BasicBlock *StartBB, *EndBB;
+    auto BodyGenCB = [&](InsertPointTy AllocaIP, InsertPointTy CodeGenIP,
+                         BasicBlock &ContinuationIP) {
+      BasicBlock *CGStartBB = CodeGenIP.getBlock();
+      BasicBlock *CGEndBB =
+          SplitBlock(CGStartBB, &*CodeGenIP.getPoint(), DT, LI);
+      CGStartBB->getTerminator()->setSuccessor(0, StartBB);
+      EndBB->getTerminator()->setSuccessor(0, CGEndBB);
+    };
+
+    auto PrivCB = [&](InsertPointTy AllocaIP, InsertPointTy CodeGenIP,
+                      Value &VPtr, Value *&ReplacementValue) -> InsertPointTy {
+      ReplacementValue = &VPtr;
+      return CodeGenIP;
+    };
+
+    auto FiniCB = [&](InsertPointTy CodeGenIP) {};
+
+    // Helper to merge the __kmpc_fork_call calls in MergableCIs. They are all
+    // contained in BB and only separated by instructions that can be
+    // redundantly executed in parallel. The block BB is split before the first
+    // call (in MergableCIs) and after the last so the entire region we merge
+    // into a single parallel region is contained in a single basic block
+    // without any other instructions. We use the OpenMPIRBuilder to outline
+    // that block and call the resulting function via __kmpc_fork_call.
+    auto Merge = [&](SmallVectorImpl<CallInst *> &MergableCIs, BasicBlock *BB) {
+      // TODO: Change the interface to allow single CIs expanded, e.g, to
+      // include an outer loop.
+      assert(MergableCIs.size() > 1 && "Assumed multiple mergable CIs");
+
+      Function *OriginalFn = BB->getParent();
+      LLVM_DEBUG(dbgs() << TAG << "Merge " << MergableCIs.size()
+                        << " parallel regions in " << OriginalFn->getName()
+                        << "\n");
+
+      // Isolate the calls to merge in a separate block.
+      EndBB = SplitBlock(BB, MergableCIs.back()->getNextNode(), DT, LI);
+      BasicBlock *AfterBB =
+          SplitBlock(EndBB, &*EndBB->getFirstInsertionPt(), DT, LI);
+      StartBB =
+          SplitBlock(BB, MergableCIs.front(), DT, LI, nullptr, "expnd_omp_pr");
+
+      assert(BB->getUniqueSuccessor() == StartBB && "Expected a different CFG");
+      const DebugLoc DL = BB->getTerminator()->getDebugLoc();
+      BB->getTerminator()->eraseFromParent();
+
+      OpenMPIRBuilder::LocationDescription Loc(InsertPointTy(BB, BB->end()),
+                                               DL);
+      // TODO: Verify proc bind matches and use the value.
+      // TODO: Check the cancellable flag.
+      InsertPointTy AfterIP = OMPInfoCache.OMPBuilder.CreateParallel(
+          Loc, BodyGenCB, PrivCB, FiniCB, nullptr, nullptr,
+          OMP_PROC_BIND_default, /* IsCancellable */ false);
+      BranchInst::Create(AfterBB, AfterIP.getBlock());
+
+      // Perform the actual outlining.
+      OMPInfoCache.OMPBuilder.finalize();
+
+      Function *OutlinedFn = MergableCIs.front()->getCaller();
+
+      // Replace the __kmpc_fork_call calls with direct calls to the outlined
+      // callbacks.
+      SmallVector<Value *, 8> Args;
+      for (auto *CI : MergableCIs) {
+        Value *Callee =
+            CI->getArgOperand(CallbackCalleeOperand)->stripPointerCasts();
+        FunctionType *FT =
+            cast<FunctionType>(Callee->getType()->getPointerElementType());
+        Args.clear();
+        Args.push_back(OutlinedFn->getArg(0));
+        Args.push_back(OutlinedFn->getArg(1));
+        for (unsigned u = CallbackFirstArgOperand, e = CI->getNumArgOperands();
+             u < e; ++u)
+          Args.push_back(CI->getArgOperand(u));
+
+        CallInst *newCI = CallInst::Create(FT, Callee, Args, "", CI);
+        if (CI->getDebugLoc())
+          newCI->setDebugLoc(CI->getDebugLoc());
+
+        // Forward parameter attributes.
+        for (const Attribute &A :
+             OutlinedFn->getAttributes().getParamAttributes(0))
+          newCI->addParamAttr(0, A);
+        for (const Attribute &A :
+             OutlinedFn->getAttributes().getParamAttributes(1))
+          newCI->addParamAttr(1, A);
+        for (unsigned u = CallbackFirstArgOperand, e = CI->getNumArgOperands();
+             u < e; ++u)
+          for (const Attribute &A : CI->getAttributes().getParamAttributes(u))
+            newCI->addParamAttr(u - 1, A);
+
+        // Emit an explicit barrier to replace the implicit fork-join barrier.
+        if (CI != MergableCIs.back()) {
+          // TODO: Remove barrier if the merged parallel region includes the
+          // 'nowait' clause.
+          OMPInfoCache.OMPBuilder.CreateBarrier(
+              InsertPointTy(newCI->getParent(),
+                            newCI->getNextNode()->getIterator()),
+              OMPD_parallel);
+          OMPInfoCache.OMPBuilder.finalize();
+        }
+
+        auto Remark = [&](OptimizationRemark OR) {
+          return OR << "Parallel region in "
+                    << ore::NV("OpenMPParallelMerge",
+                               CI->getCaller()->getName())
+                    << " merged";
+        };
+        emitRemark<OptimizationRemark>(CI, "OpenMPParallelRegionMerging",
+                                       Remark);
+
+        CI->eraseFromParent();
+      }
+
+      assert(OutlinedFn != OriginalFn && "Outlining failed");
+      CGUpdater.registerOutlinedFunction(*OutlinedFn);
+      CGUpdater.reanalyzeFunction(*OriginalFn);
+
+      NumOpenMPParallelRegionsMerged += MergableCIs.size();
+
+      return true;
+    };
+
+    // Helper function that performs that identifes sequences of
+    // __kmpc_fork_call uses in a basic block.
+    auto DetectPRsCB = [&](Use &U, Function &F) {
+      CallInst *CI = getCallIfRegularCall(U, &RFI);
+      BB2PRMap[CI->getParent()].insert(CI);
+
+      LLVM_DEBUG(dbgs() << TAG << "Found parallel regions in "
+                        << BB2PRMap.size() << " blocks in " << F.getName()
+                        << "\n");
+
+      return false;
+    };
+
+    BB2PRMap.clear();
+    RFI.foreachUse(SCC, DetectPRsCB);
+    SmallVector<SmallVector<CallInst *, 4>, 4> MergableCIsVector;
+    // Find mergable parallel regions within a basic block that are
+    // safe to merge, that is any in-between instructions can safely
+    // execute in parallel after merging.
+    // TODO: support merging across basic-blocks.
+    for (auto &It : BB2PRMap) {
+      auto &CIs = It.getSecond();
+      if (CIs.size() < 2)
+        continue;
+
+      BasicBlock *BB = It.getFirst();
+      SmallVector<CallInst *, 4> MergableCIs;
+
+      // Find maximal number of parallel region CIs that are safe to merge.
+      for (Instruction &I : *BB) {
+        if (CIs.count(&I)) {
+          MergableCIs.push_back(cast<CallInst>(&I));
+          continue;
+        }
+
+        if (isSafeToSpeculativelyExecute(&I, &I, DT))
+          continue;
+
+        if (MergableCIs.size() > 1)
+          MergableCIsVector.push_back(MergableCIs);
+
+        MergableCIs.clear();
+      }
+
+      if (!MergableCIsVector.empty()) {
+        Changed = true;
+
+        for (auto &MergableCIs : MergableCIsVector)
+          Merge(MergableCIs, BB);
+      }
+    }
+
+    if (Changed) {
+      // Update RFI info to set it up for later passes.
+      RFI.clearUsesMap();
+      OMPInfoCache.collectUses(RFI, /* CollectStats */ false);
+    }
+
+    return Changed;
+  }
+
   /// Try to delete parallel regions if possible.
   bool deleteParallelRegions() {
     const unsigned CallbackCalleeOperand = 2;
@@ -1299,8 +1507,10 @@
   // TODO: Compute the module slice we are allowed to look at.
   OpenMPOpt OMPOpt(SCC, CGUpdater, OREGetter, InfoCache, A);
   bool Changed = OMPOpt.run();
-  (void)Changed;
-  return PreservedAnalyses::all();
+  if (Changed)
+    return PreservedAnalyses::none();
+  else
+    return PreservedAnalyses::all();
 }
 
 namespace {
diff --git a/llvm/test/Transforms/OpenMP/parallel_deletion.ll b/llvm/test/Transforms/OpenMP/parallel_deletion.ll
--- a/llvm/test/Transforms/OpenMP/parallel_deletion.ll
+++ b/llvm/test/Transforms/OpenMP/parallel_deletion.ll
@@ -39,24 +39,47 @@
 }
 
 define internal void @.omp_outlined.willreturn(i32* noalias %.global_tid., i32* noalias %.bound_tid.) {
+; CHECK-LABEL: define {{[^@]+}}@.omp_outlined.willreturn
+; CHECK-SAME: (i32* noalias nocapture nofree readnone [[DOTGLOBAL_TID_:%.*]], i32* noalias nocapture nofree readnone [[DOTBOUND_TID_:%.*]]) #0
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    call void @unknown() #0
+; CHECK-NEXT:    ret void
+;
 entry:
   call void @unknown() willreturn
   ret void
 }
 
 define internal void @.omp_outlined.willreturn.0(i32* noalias %.global_tid., i32* noalias %.bound_tid.) willreturn {
+; CHECK-LABEL: define {{[^@]+}}@.omp_outlined.willreturn.0
+; CHECK-SAME: (i32* noalias nocapture nofree readnone [[DOTGLOBAL_TID_:%.*]], i32* noalias nocapture nofree readnone [[DOTBOUND_TID_:%.*]]) #1
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    call void @readonly() #5
+; CHECK-NEXT:    ret void
+;
 entry:
   call void @readonly()
   ret void
 }
 
 define internal void @.omp_outlined.willreturn.1(i32* noalias %.global_tid., i32* noalias %.bound_tid.) {
+; CHECK-LABEL: define {{[^@]+}}@.omp_outlined.willreturn.1
+; CHECK-SAME: (i32* noalias nocapture nofree readnone [[DOTGLOBAL_TID_:%.*]], i32* noalias nocapture nofree readnone [[DOTBOUND_TID_:%.*]]) #2
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    call void @readnone() #0
+; CHECK-NEXT:    ret void
+;
 entry:
   call void @readnone() willreturn
   ret void
 }
 
 define internal void @.omp_outlined.willreturn.2(i32* noalias %.global_tid., i32* noalias %.bound_tid.) {
+; CHECK-LABEL: define {{[^@]+}}@.omp_outlined.willreturn.2
+; CHECK-SAME: (i32* noalias nocapture nofree readnone [[DOTGLOBAL_TID_:%.*]], i32* noalias nocapture nofree readnone [[DOTBOUND_TID_:%.*]]) #3
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    ret void
+;
 entry:
   ret void
 }
@@ -76,9 +99,14 @@
 define void @delete_parallel_1() {
 ; CHECK-LABEL: define {{[^@]+}}@delete_parallel_1()
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* nonnull align 8 dereferenceable(24) @0, i32 0, void (i32*, i32*, ...)* nonnull bitcast (void (i32*, i32*)* @.omp_outlined. to void (i32*, i32*, ...)*))
-; CHECK-NEXT:    call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* nonnull align 8 dereferenceable(24) @0, i32 0, void (i32*, i32*, ...)* nonnull bitcast (void (i32*, i32*)* @.omp_outlined..0 to void (i32*, i32*, ...)*))
-; CHECK-NEXT:    call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* nonnull align 8 dereferenceable(24) @0, i32 0, void (i32*, i32*, ...)* nonnull bitcast (void (i32*, i32*)* @.omp_outlined..1 to void (i32*, i32*, ...)*))
+; CHECK-NEXT:    [[OMP_GLOBAL_THREAD_NUM:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @0)
+; CHECK-NEXT:    br label [[OMP_PARALLEL:%.*]]
+; CHECK:       omp_parallel:
+; CHECK-NEXT:    call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* @0, i32 0, void (i32*, i32*, ...)* bitcast (void (i32*, i32*)* @delete_parallel_1..omp_par to void (i32*, i32*, ...)*))
+; CHECK-NEXT:    br label [[OMP_PAR_EXIT_SPLIT:%.*]]
+; CHECK:       omp.par.exit.split:
+; CHECK-NEXT:    br label [[ENTRY_SPLIT_SPLIT:%.*]]
+; CHECK:       entry.split.split:
 ; CHECK-NEXT:    ret void
 ;
 entry:
@@ -90,24 +118,47 @@
 }
 
 define internal void @.omp_outlined.(i32* noalias %.global_tid., i32* noalias %.bound_tid.) {
+; CHECK-LABEL: define {{[^@]+}}@.omp_outlined.
+; CHECK-SAME: (i32* noalias nocapture nofree readnone [[DOTGLOBAL_TID_:%.*]], i32* noalias nocapture nofree readnone [[DOTBOUND_TID_:%.*]])
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    call void @unknown()
+; CHECK-NEXT:    ret void
+;
 entry:
   call void @unknown()
   ret void
 }
 
 define internal void @.omp_outlined..0(i32* noalias %.global_tid., i32* noalias %.bound_tid.) {
+; CHECK-LABEL: define {{[^@]+}}@.omp_outlined..0
+; CHECK-SAME: (i32* noalias nocapture nofree readnone [[DOTGLOBAL_TID_:%.*]], i32* noalias nocapture nofree readnone [[DOTBOUND_TID_:%.*]]) #5
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    call void @readonly() #5
+; CHECK-NEXT:    ret void
+;
 entry:
   call void @readonly()
   ret void
 }
 
 define internal void @.omp_outlined..1(i32* noalias %.global_tid., i32* noalias %.bound_tid.) {
+; CHECK-LABEL: define {{[^@]+}}@.omp_outlined..1
+; CHECK-SAME: (i32* noalias nocapture nofree readnone [[DOTGLOBAL_TID_:%.*]], i32* noalias nocapture nofree readnone [[DOTBOUND_TID_:%.*]]) #6
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    call void @readnone()
+; CHECK-NEXT:    ret void
+;
 entry:
   call void @readnone()
   ret void
 }
 
 define internal void @.omp_outlined..2(i32* noalias %.global_tid., i32* noalias %.bound_tid.) {
+; CHECK-LABEL: define {{[^@]+}}@.omp_outlined..2
+; CHECK-SAME: (i32* noalias nocapture nofree readnone [[DOTGLOBAL_TID_:%.*]], i32* noalias nocapture nofree readnone [[DOTBOUND_TID_:%.*]]) #3
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    ret void
+;
 entry:
   ret void
 }
@@ -144,10 +195,14 @@
 ; CHECK-NEXT:    [[TMP:%.*]] = bitcast i32* [[A]] to i8*
 ; CHECK-NEXT:    call void @llvm.lifetime.start.p0i8(i64 4, i8* nonnull align 4 dereferenceable(4) [[TMP]]) #0
 ; CHECK-NEXT:    store i32 0, i32* [[A]], align 4
-; CHECK-NEXT:    call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* nonnull align 8 dereferenceable(24) @0, i32 1, void (i32*, i32*, ...)* nonnull bitcast (void (i32*, i32*, i32*)* @.omp_outlined..3 to void (i32*, i32*, ...)*), i32* nocapture nofree nonnull align 4 dereferenceable(4) [[A]])
-; CHECK-NEXT:    call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* nonnull align 8 dereferenceable(24) @0, i32 1, void (i32*, i32*, ...)* nonnull bitcast (void (i32*, i32*, i32*)* @.omp_outlined..4 to void (i32*, i32*, ...)*), i32* nocapture nonnull align 4 dereferenceable(4) [[A]])
-; CHECK-NEXT:    call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* nonnull align 8 dereferenceable(24) @0, i32 1, void (i32*, i32*, ...)* nonnull bitcast (void (i32*, i32*, i32*)* @.omp_outlined..5 to void (i32*, i32*, ...)*), i32* nocapture nonnull align 4 dereferenceable(4) [[A]])
-; CHECK-NEXT:    call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* nonnull align 8 dereferenceable(24) @0, i32 1, void (i32*, i32*, ...)* nonnull bitcast (void (i32*, i32*, i32*)* @.omp_outlined..6 to void (i32*, i32*, ...)*), i32* nocapture nonnull align 4 dereferenceable(4) [[A]])
+; CHECK-NEXT:    [[OMP_GLOBAL_THREAD_NUM:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @0)
+; CHECK-NEXT:    br label [[OMP_PARALLEL:%.*]]
+; CHECK:       omp_parallel:
+; CHECK-NEXT:    call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* @0, i32 1, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, i32*)* @delete_parallel_2..omp_par to void (i32*, i32*, ...)*), i32* [[A]])
+; CHECK-NEXT:    br label [[OMP_PAR_EXIT_SPLIT:%.*]]
+; CHECK:       omp.par.exit.split:
+; CHECK-NEXT:    br label [[ENTRY_SPLIT_SPLIT:%.*]]
+; CHECK:       entry.split.split:
 ; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i32* [[A]] to i8*
 ; CHECK-NEXT:    call void @llvm.lifetime.end.p0i8(i64 4, i8* nonnull [[TMP1]])
 ; CHECK-NEXT:    ret void
@@ -167,6 +222,20 @@
 }
 
 define internal void @.omp_outlined..3(i32* noalias %.global_tid., i32* noalias %.bound_tid., i32* dereferenceable(4) %a) {
+; CHECK-LABEL: define {{[^@]+}}@.omp_outlined..3
+; CHECK-SAME: (i32* noalias nocapture nofree readnone [[DOTGLOBAL_TID_:%.*]], i32* noalias nocapture nofree readnone [[DOTBOUND_TID_:%.*]], i32* nocapture nofree nonnull align 4 dereferenceable(4) [[A:%.*]]) #7
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[CALL:%.*]] = call i32 @omp_get_thread_num() #5
+; CHECK-NEXT:    [[CMP:%.*]] = icmp eq i32 [[CALL]], 0
+; CHECK-NEXT:    br i1 [[CMP]], label [[IF_THEN:%.*]], label [[IF_END:%.*]]
+; CHECK:       if.then:
+; CHECK-NEXT:    [[TMP:%.*]] = load i32, i32* [[A]], align 4
+; CHECK-NEXT:    [[INC:%.*]] = add nsw i32 [[TMP]], 1
+; CHECK-NEXT:    store i32 [[INC]], i32* [[A]], align 4
+; CHECK-NEXT:    br label [[IF_END]]
+; CHECK:       if.end:
+; CHECK-NEXT:    ret void
+;
 entry:
   %call = call i32 @omp_get_thread_num()
   %cmp = icmp eq i32 %call, 0
@@ -183,6 +252,22 @@
 }
 
 define internal void @.omp_outlined..4(i32* noalias %.global_tid., i32* noalias %.bound_tid., i32* dereferenceable(4) %a) {
+; CHECK-LABEL: define {{[^@]+}}@.omp_outlined..4
+; CHECK-SAME: (i32* noalias nocapture nonnull readonly align 4 dereferenceable(4) [[DOTGLOBAL_TID_:%.*]], i32* noalias nocapture nofree readnone [[DOTBOUND_TID_:%.*]], i32* nocapture nonnull align 4 dereferenceable(4) [[A:%.*]])
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP:%.*]] = load i32, i32* [[DOTGLOBAL_TID_]], align 4
+; CHECK-NEXT:    [[TMP1:%.*]] = call i32 @__kmpc_master(%struct.ident_t* nonnull @0, i32 [[TMP]])
+; CHECK-NEXT:    [[TMP2:%.*]] = icmp eq i32 [[TMP1]], 0
+; CHECK-NEXT:    br i1 [[TMP2]], label [[OMP_IF_END:%.*]], label [[OMP_IF_THEN:%.*]]
+; CHECK:       omp_if.then:
+; CHECK-NEXT:    [[TMP3:%.*]] = load i32, i32* [[A]], align 4
+; CHECK-NEXT:    [[INC:%.*]] = add nsw i32 [[TMP3]], 1
+; CHECK-NEXT:    store i32 [[INC]], i32* [[A]], align 4
+; CHECK-NEXT:    call void @__kmpc_end_master(%struct.ident_t* nonnull @0, i32 [[TMP]])
+; CHECK-NEXT:    br label [[OMP_IF_END]]
+; CHECK:       omp_if.end:
+; CHECK-NEXT:    ret void
+;
 entry:
   %tmp = load i32, i32* %.global_tid., align 4
   %tmp1 = call i32 @__kmpc_master(%struct.ident_t* nonnull @0, i32 %tmp)
@@ -209,6 +294,24 @@
 declare void @__kmpc_end_master(%struct.ident_t*, i32)
 
 define internal void @.omp_outlined..5(i32* noalias %.global_tid., i32* noalias %.bound_tid., i32* dereferenceable(4) %a) {
+; CHECK-LABEL: define {{[^@]+}}@.omp_outlined..5
+; CHECK-SAME: (i32* noalias nocapture nonnull readonly align 4 dereferenceable(4) [[DOTGLOBAL_TID_:%.*]], i32* noalias nocapture nofree readnone [[DOTBOUND_TID_:%.*]], i32* nocapture nonnull align 4 dereferenceable(4) [[A:%.*]])
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[OMP_GLOBAL_THREAD_NUM:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* nonnull @0)
+; CHECK-NEXT:    [[TMP:%.*]] = load i32, i32* [[DOTGLOBAL_TID_]], align 4
+; CHECK-NEXT:    [[TMP1:%.*]] = call i32 @__kmpc_single(%struct.ident_t* nonnull @0, i32 [[TMP]])
+; CHECK-NEXT:    [[TMP2:%.*]] = icmp eq i32 [[TMP1]], 0
+; CHECK-NEXT:    br i1 [[TMP2]], label [[OMP_IF_END:%.*]], label [[OMP_IF_THEN:%.*]]
+; CHECK:       omp_if.then:
+; CHECK-NEXT:    [[TMP3:%.*]] = load i32, i32* [[A]], align 4
+; CHECK-NEXT:    [[INC:%.*]] = add nsw i32 [[TMP3]], 1
+; CHECK-NEXT:    store i32 [[INC]], i32* [[A]], align 4
+; CHECK-NEXT:    call void @__kmpc_end_single(%struct.ident_t* nonnull @0, i32 [[TMP]])
+; CHECK-NEXT:    br label [[OMP_IF_END]]
+; CHECK:       omp_if.end:
+; CHECK-NEXT:    call void @__kmpc_barrier(%struct.ident_t* nonnull @1, i32 [[OMP_GLOBAL_THREAD_NUM]])
+; CHECK-NEXT:    ret void
+;
 entry:
   %omp_global_thread_num = call i32 @__kmpc_global_thread_num(%struct.ident_t* nonnull @0)
   %tmp = load i32, i32* %.global_tid., align 4
@@ -229,6 +332,39 @@
 }
 
 define internal void @.omp_outlined..6(i32* noalias %.global_tid., i32* noalias %.bound_tid., i32* dereferenceable(4) %a) {
+; CHECK-LABEL: define {{[^@]+}}@.omp_outlined..6
+; CHECK-SAME: (i32* noalias nocapture nonnull readonly align 4 dereferenceable(4) [[DOTGLOBAL_TID_:%.*]], i32* noalias nocapture nofree readnone [[DOTBOUND_TID_:%.*]], i32* nocapture nonnull align 4 dereferenceable(4) [[A:%.*]])
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[A1:%.*]] = alloca i32, align 4
+; CHECK-NEXT:    [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [1 x i8*], align 8
+; CHECK-NEXT:    [[TMP:%.*]] = bitcast i32* [[A1]] to i8*
+; CHECK-NEXT:    call void @llvm.lifetime.start.p0i8(i64 4, i8* nonnull align 4 [[TMP]]) #0
+; CHECK-NEXT:    store i32 1, i32* [[A1]], align 4
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast [1 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]] to i32**
+; CHECK-NEXT:    store i32* [[A1]], i32** [[TMP1]], align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load i32, i32* [[DOTGLOBAL_TID_]], align 4
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast [1 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]] to i8*
+; CHECK-NEXT:    [[TMP4:%.*]] = call i32 @__kmpc_reduce_nowait(%struct.ident_t* nonnull @2, i32 [[TMP2]], i32 1, i64 8, i8* nonnull align 8 [[TMP3]], void (i8*, i8*)* nonnull @.omp.reduction.reduction_func, [8 x i32]* nonnull @.gomp_critical_user_.reduction.var)
+; CHECK-NEXT:    switch i32 [[TMP4]], label [[DOTOMP_REDUCTION_DEFAULT:%.*]] [
+; CHECK-NEXT:    i32 1, label [[DOTOMP_REDUCTION_CASE1:%.*]]
+; CHECK-NEXT:    i32 2, label [[DOTOMP_REDUCTION_CASE2:%.*]]
+; CHECK-NEXT:    ]
+; CHECK:       .omp.reduction.case1:
+; CHECK-NEXT:    [[TMP5:%.*]] = load i32, i32* [[A]], align 4
+; CHECK-NEXT:    [[TMP6:%.*]] = load i32, i32* [[A1]], align 4
+; CHECK-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP5]], [[TMP6]]
+; CHECK-NEXT:    store i32 [[ADD]], i32* [[A]], align 4
+; CHECK-NEXT:    call void @__kmpc_end_reduce_nowait(%struct.ident_t* nonnull @2, i32 [[TMP2]], [8 x i32]* nonnull @.gomp_critical_user_.reduction.var)
+; CHECK-NEXT:    br label [[DOTOMP_REDUCTION_DEFAULT]]
+; CHECK:       .omp.reduction.case2:
+; CHECK-NEXT:    [[TMP7:%.*]] = load i32, i32* [[A1]], align 4
+; CHECK-NEXT:    [[TMP8:%.*]] = atomicrmw add i32* [[A]], i32 [[TMP7]] monotonic
+; CHECK-NEXT:    br label [[DOTOMP_REDUCTION_DEFAULT]]
+; CHECK:       .omp.reduction.default:
+; CHECK-NEXT:    [[TMP9:%.*]] = bitcast i32* [[A1]] to i8*
+; CHECK-NEXT:    call void @llvm.lifetime.end.p0i8(i64 4, i8* nonnull [[TMP9]])
+; CHECK-NEXT:    ret void
+;
 entry:
   %a1 = alloca i32, align 4
   %.omp.reduction.red_list = alloca [1 x i8*], align 8
@@ -241,8 +377,8 @@
   %tmp3 = bitcast [1 x i8*]* %.omp.reduction.red_list to i8*
   %tmp4 = call i32 @__kmpc_reduce_nowait(%struct.ident_t* nonnull @2, i32 %tmp2, i32 1, i64 8, i8* nonnull %tmp3, void (i8*, i8*)* nonnull @.omp.reduction.reduction_func, [8 x i32]* nonnull @.gomp_critical_user_.reduction.var)
   switch i32 %tmp4, label %.omp.reduction.default [
-    i32 1, label %.omp.reduction.case1
-    i32 2, label %.omp.reduction.case2
+  i32 1, label %.omp.reduction.case1
+  i32 2, label %.omp.reduction.case2
   ]
 
 .omp.reduction.case1:                             ; preds = %entry
@@ -265,6 +401,19 @@
 }
 
 define internal void @.omp.reduction.reduction_func(i8* %arg, i8* %arg1) {
+; CHECK-LABEL: define {{[^@]+}}@.omp.reduction.reduction_func
+; CHECK-SAME: (i8* nocapture nofree nonnull readonly align 8 dereferenceable(8) [[ARG:%.*]], i8* nocapture nofree nonnull readonly align 8 dereferenceable(8) [[ARG1:%.*]]) #11
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP:%.*]] = bitcast i8* [[ARG1]] to i32**
+; CHECK-NEXT:    [[TMP2:%.*]] = load i32*, i32** [[TMP]], align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast i8* [[ARG]] to i32**
+; CHECK-NEXT:    [[TMP4:%.*]] = load i32*, i32** [[TMP3]], align 8
+; CHECK-NEXT:    [[TMP5:%.*]] = load i32, i32* [[TMP4]], align 4
+; CHECK-NEXT:    [[TMP6:%.*]] = load i32, i32* [[TMP2]], align 4
+; CHECK-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP5]], [[TMP6]]
+; CHECK-NEXT:    store i32 [[ADD]], i32* [[TMP4]], align 4
+; CHECK-NEXT:    ret void
+;
 entry:
   %tmp = bitcast i8* %arg1 to i32**
   %tmp2 = load i32*, i32** %tmp, align 8
diff --git a/llvm/test/Transforms/OpenMP/parallel_region_merging.ll b/llvm/test/Transforms/OpenMP/parallel_region_merging.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/Transforms/OpenMP/parallel_region_merging.ll
@@ -0,0 +1,254 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --function-signature
+; RUN: opt -S -attributor -openmpopt  < %s | FileCheck %s
+; RUN: opt -S -passes='attributor,cgscc(openmpopt)'  < %s | FileCheck %s
+;
+target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"
+
+%struct.ident_t = type { i32, i32, i32, i32, i8* }
+
+@.str = private unnamed_addr constant [23 x i8] c";unknown;unknown;0;0;;\00", align 1
+@0 = private unnamed_addr global %struct.ident_t { i32 0, i32 2, i32 0, i32 0, i8* getelementptr inbounds ([23 x i8], [23 x i8]* @.str, i32 0, i32 0) }, align 8
+
+;    void merge_all() {
+;    #pragma omp parallel
+;        {
+;        }
+;    #pragma omp parallel
+;        {
+;        }
+;    }
+;
+; Merge all parallel regions
+define dso_local void @merge_all() #0 {
+; CHECK-LABEL: define {{[^@]+}}@merge_all() #0
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[OMP_GLOBAL_THREAD_NUM:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @0)
+; CHECK-NEXT:    br label [[OMP_PARALLEL:%.*]]
+; CHECK:       omp_parallel:
+; CHECK-NEXT:    call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* @0, i32 0, void (i32*, i32*, ...)* bitcast (void (i32*, i32*)* @merge_all..omp_par to void (i32*, i32*, ...)*))
+; CHECK-NEXT:    br label [[OMP_PAR_EXIT_SPLIT:%.*]]
+; CHECK:       omp.par.exit.split:
+; CHECK-NEXT:    br label [[ENTRY_SPLIT_SPLIT:%.*]]
+; CHECK:       entry.split.split:
+; CHECK-NEXT:    ret void
+;
+entry:
+  call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* @0, i32 0, void (i32*, i32*, ...)* bitcast (void (i32*, i32*)* @.omp_outlined. to void (i32*, i32*, ...)*))
+  call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* @0, i32 0, void (i32*, i32*, ...)* bitcast (void (i32*, i32*)* @.omp_outlined..1 to void (i32*, i32*, ...)*))
+  ret void
+}
+
+define internal void @.omp_outlined.(i32* noalias %.global_tid., i32* noalias %.bound_tid.) #1 {
+; CHECK-LABEL: define {{[^@]+}}@.omp_outlined.
+; CHECK-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]]) #1
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8
+; CHECK-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8
+; CHECK-NEXT:    store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8
+; CHECK-NEXT:    store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8
+; CHECK-NEXT:    ret void
+;
+entry:
+  %.global_tid..addr = alloca i32*, align 8
+  %.bound_tid..addr = alloca i32*, align 8
+  store i32* %.global_tid., i32** %.global_tid..addr, align 8
+  store i32* %.bound_tid., i32** %.bound_tid..addr, align 8
+  ret void
+}
+
+declare !callback !2 void @__kmpc_fork_call(%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) #2
+
+define internal void @.omp_outlined..1(i32* noalias %.global_tid., i32* noalias %.bound_tid.) #1 {
+; CHECK-LABEL: define {{[^@]+}}@.omp_outlined..1
+; CHECK-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]]) #1
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8
+; CHECK-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8
+; CHECK-NEXT:    store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8
+; CHECK-NEXT:    store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8
+; CHECK-NEXT:    ret void
+;
+entry:
+  %.global_tid..addr = alloca i32*, align 8
+  %.bound_tid..addr = alloca i32*, align 8
+  store i32* %.global_tid., i32** %.global_tid..addr, align 8
+  store i32* %.bound_tid., i32** %.bound_tid..addr, align 8
+  ret void
+}
+;    void merge_none() {
+;        int a = 1;
+;    #pragma omp parallel
+;        {
+;        }
+;        a = 2;
+;    #pragma omp parallel
+;        {
+;        }
+;    }
+;
+; Do not merge parallel regions, in-between store
+; instruction is unsafe to execute in parallel
+define dso_local void @merge_none() #0 {
+; CHECK-LABEL: define {{[^@]+}}@merge_none() #0
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[A:%.*]] = alloca i32, align 4
+; CHECK-NEXT:    store i32 1, i32* [[A]], align 4
+; CHECK-NEXT:    call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* @0, i32 0, void (i32*, i32*, ...)* bitcast (void (i32*, i32*)* @.omp_outlined..2 to void (i32*, i32*, ...)*))
+; CHECK-NEXT:    store i32 2, i32* [[A]], align 4
+; CHECK-NEXT:    call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* @0, i32 0, void (i32*, i32*, ...)* bitcast (void (i32*, i32*)* @.omp_outlined..3 to void (i32*, i32*, ...)*))
+; CHECK-NEXT:    ret void
+;
+entry:
+  %a = alloca i32, align 4
+  store i32 1, i32* %a, align 4
+  call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* @0, i32 0, void (i32*, i32*, ...)* bitcast (void (i32*, i32*)* @.omp_outlined..2 to void (i32*, i32*, ...)*))
+  store i32 2, i32* %a, align 4
+  call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* @0, i32 0, void (i32*, i32*, ...)* bitcast (void (i32*, i32*)* @.omp_outlined..3 to void (i32*, i32*, ...)*))
+  ret void
+}
+
+define internal void @.omp_outlined..2(i32* noalias %.global_tid., i32* noalias %.bound_tid.) #1 {
+; CHECK-LABEL: define {{[^@]+}}@.omp_outlined..2
+; CHECK-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]]) #1
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8
+; CHECK-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8
+; CHECK-NEXT:    store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8
+; CHECK-NEXT:    store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8
+; CHECK-NEXT:    ret void
+;
+entry:
+  %.global_tid..addr = alloca i32*, align 8
+  %.bound_tid..addr = alloca i32*, align 8
+  store i32* %.global_tid., i32** %.global_tid..addr, align 8
+  store i32* %.bound_tid., i32** %.bound_tid..addr, align 8
+  ret void
+}
+
+define internal void @.omp_outlined..3(i32* noalias %.global_tid., i32* noalias %.bound_tid.) #1 {
+; CHECK-LABEL: define {{[^@]+}}@.omp_outlined..3
+; CHECK-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]]) #1
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8
+; CHECK-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8
+; CHECK-NEXT:    store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8
+; CHECK-NEXT:    store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8
+; CHECK-NEXT:    ret void
+;
+entry:
+  %.global_tid..addr = alloca i32*, align 8
+  %.bound_tid..addr = alloca i32*, align 8
+  store i32* %.global_tid., i32** %.global_tid..addr, align 8
+  store i32* %.bound_tid., i32** %.bound_tid..addr, align 8
+  ret void
+}
+
+;    void merge_some() {
+;        int a = 1;
+;    #pragma omp parallel
+;        {
+;        }
+;        a = 2;
+;    #pragma omp parallel
+;        {
+;        }
+;    #pragma omp parallel
+;        {
+;        }
+;    }
+;
+; Do not merge first parallel region, due to the
+; unsafe store, but merge the two next parallel
+; regions
+define dso_local void @merge_some() #0 {
+; CHECK-LABEL: define {{[^@]+}}@merge_some() #0
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[A:%.*]] = alloca i32, align 4
+; CHECK-NEXT:    store i32 1, i32* [[A]], align 4
+; CHECK-NEXT:    call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* @0, i32 0, void (i32*, i32*, ...)* bitcast (void (i32*, i32*)* @.omp_outlined..4 to void (i32*, i32*, ...)*))
+; CHECK-NEXT:    store i32 2, i32* [[A]], align 4
+; CHECK-NEXT:    [[OMP_GLOBAL_THREAD_NUM:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @0)
+; CHECK-NEXT:    br label [[OMP_PARALLEL:%.*]]
+; CHECK:       omp_parallel:
+; CHECK-NEXT:    call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* @0, i32 0, void (i32*, i32*, ...)* bitcast (void (i32*, i32*)* @merge_some..omp_par to void (i32*, i32*, ...)*))
+; CHECK-NEXT:    br label [[OMP_PAR_EXIT_SPLIT:%.*]]
+; CHECK:       omp.par.exit.split:
+; CHECK-NEXT:    br label [[ENTRY_SPLIT_SPLIT:%.*]]
+; CHECK:       entry.split.split:
+; CHECK-NEXT:    ret void
+;
+entry:
+  %a = alloca i32, align 4
+  store i32 1, i32* %a, align 4
+  call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* @0, i32 0, void (i32*, i32*, ...)* bitcast (void (i32*, i32*)* @.omp_outlined..4 to void (i32*, i32*, ...)*))
+  store i32 2, i32* %a, align 4
+  call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* @0, i32 0, void (i32*, i32*, ...)* bitcast (void (i32*, i32*)* @.omp_outlined..5 to void (i32*, i32*, ...)*))
+  call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* @0, i32 0, void (i32*, i32*, ...)* bitcast (void (i32*, i32*)* @.omp_outlined..6 to void (i32*, i32*, ...)*))
+  ret void
+}
+
+define internal void @.omp_outlined..4(i32* noalias %.global_tid., i32* noalias %.bound_tid.) #1 {
+; CHECK-LABEL: define {{[^@]+}}@.omp_outlined..4
+; CHECK-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]]) #1
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8
+; CHECK-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8
+; CHECK-NEXT:    store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8
+; CHECK-NEXT:    store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8
+; CHECK-NEXT:    ret void
+;
+entry:
+  %.global_tid..addr = alloca i32*, align 8
+  %.bound_tid..addr = alloca i32*, align 8
+  store i32* %.global_tid., i32** %.global_tid..addr, align 8
+  store i32* %.bound_tid., i32** %.bound_tid..addr, align 8
+  ret void
+}
+
+define internal void @.omp_outlined..5(i32* noalias %.global_tid., i32* noalias %.bound_tid.) #1 {
+; CHECK-LABEL: define {{[^@]+}}@.omp_outlined..5
+; CHECK-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]]) #1
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8
+; CHECK-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8
+; CHECK-NEXT:    store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8
+; CHECK-NEXT:    store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8
+; CHECK-NEXT:    ret void
+;
+entry:
+  %.global_tid..addr = alloca i32*, align 8
+  %.bound_tid..addr = alloca i32*, align 8
+  store i32* %.global_tid., i32** %.global_tid..addr, align 8
+  store i32* %.bound_tid., i32** %.bound_tid..addr, align 8
+  ret void
+}
+
+define internal void @.omp_outlined..6(i32* noalias %.global_tid., i32* noalias %.bound_tid.) #1 {
+; CHECK-LABEL: define {{[^@]+}}@.omp_outlined..6
+; CHECK-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]]) #1
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8
+; CHECK-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8
+; CHECK-NEXT:    store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8
+; CHECK-NEXT:    store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8
+; CHECK-NEXT:    ret void
+;
+entry:
+  %.global_tid..addr = alloca i32*, align 8
+  %.bound_tid..addr = alloca i32*, align 8
+  store i32* %.global_tid., i32** %.global_tid..addr, align 8
+  store i32* %.bound_tid., i32** %.bound_tid..addr, align 8
+  ret void
+}
+
+attributes #0 = { noinline nounwind optnone uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #1 = { noinline norecurse nounwind optnone uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #2 = { nounwind }
+
+!llvm.module.flags = !{!0}
+!llvm.ident = !{!1}
+
+!0 = !{i32 1, !"wchar_size", i32 4}
+!1 = !{!"clang version 11.0.0 (git@github.com:ggeorgakoudis/llvm-project.git e606e9aa891b114da1e9a0dbcede01dfc077f3f8)"}
+!2 = !{!3}
+!3 = !{i64 2, i64 -1, i64 -1, i1 true}