diff --git a/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h b/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h
--- a/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h
+++ b/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h
@@ -36,7 +36,10 @@
   void initialize();
 
   /// Finalize the underlying module, e.g., by outlining regions.
-  void finalize();
+  /// \param AllowExtractorSinking Flag to include sinking instructions,
+  ///                              emitted by CodeExtractor, in the
+  ///                              outlined region. Default is false.
+  void finalize(bool AllowExtractorSinking = false);
 
   /// Add attributes known for \p FnID to \p Fn.
   void addAttributes(omp::RuntimeFunction FnID, Function &Fn);
diff --git a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
--- a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
+++ b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
@@ -127,7 +127,7 @@
 
 void OpenMPIRBuilder::initialize() { initializeTypes(M); }
 
-void OpenMPIRBuilder::finalize() {
+void OpenMPIRBuilder::finalize(bool AllowExtractorSinking) {
   SmallPtrSet<BasicBlock *, 32> ParallelRegionBlockSet;
   SmallVector<BasicBlock *, 32> Blocks;
   for (OutlineInfo &OI : OutlineInfos) {
@@ -170,6 +170,22 @@
       BasicBlock &ArtificialEntry = OutlinedFn->getEntryBlock();
       assert(ArtificialEntry.getUniqueSuccessor() == OI.EntryBB);
       assert(OI.EntryBB->getUniquePredecessor() == &ArtificialEntry);
+      if (AllowExtractorSinking) {
+        // Move instructions from the to-be-deleted ArtificialEntry to the entry
+        // basic block of the parallel region. CodeExtractor may have sunk
+        // allocas/bitcasts for values that are solely used in the outlined
+        // region and do not escape.
+        for (BasicBlock::iterator It = ArtificialEntry.begin();
+             It != ArtificialEntry.end();) {
+          Instruction &I = *It;
+          It++;
+
+          if (I.isTerminator())
+            continue;
+
+          I.moveBefore(*OI.EntryBB, OI.EntryBB->getFirstInsertionPt());
+        }
+      }
       OI.EntryBB->moveBefore(&ArtificialEntry);
       ArtificialEntry.eraseFromParent();
     }
diff --git a/llvm/lib/Transforms/IPO/OpenMPOpt.cpp b/llvm/lib/Transforms/IPO/OpenMPOpt.cpp
--- a/llvm/lib/Transforms/IPO/OpenMPOpt.cpp
+++ b/llvm/lib/Transforms/IPO/OpenMPOpt.cpp
@@ -28,6 +28,7 @@
 #include "llvm/Transforms/IPO/Attributor.h"
 #include "llvm/Transforms/Utils/BasicBlockUtils.h"
 #include "llvm/Transforms/Utils/CallGraphUpdater.h"
+#include "llvm/Transforms/Utils/CodeExtractor.h"
 
 using namespace llvm;
 using namespace omp;
@@ -602,15 +603,11 @@
     if (!RFI.Declaration)
       return false;
 
-    // Check if there any __kmpc_push_proc_bind calls for explicit affinities.
-    OMPInformationCache::RuntimeFunctionInfo &ProcBindRFI =
-        OMPInfoCache.RFIs[OMPRTL___kmpc_push_proc_bind];
-
-    // Defensively abort if explicit affinities are set.
-    // TODO: Track ICV proc_bind to merge when mergable regions have the same
-    // affinity.
-    if (ProcBindRFI.Declaration)
-      return false;
+    // Unmergable calls that prevent merging a parallel region.
+    OMPInformationCache::RuntimeFunctionInfo UnmergableCallsInfo[] = {
+        OMPInfoCache.RFIs[OMPRTL___kmpc_push_proc_bind],
+        OMPInfoCache.RFIs[OMPRTL___kmpc_push_num_threads],
+    };
 
     bool Changed = false;
     LoopInfo *LI = nullptr;
@@ -638,6 +635,151 @@
 
     auto FiniCB = [&](InsertPointTy CodeGenIP) {};
 
+    auto CreateSequentialRegion =
+        [&](Function *OuterFn, BasicBlock *OuterPredBB,
+            SmallDenseMap<Value *, Value *> &ReplacementInputMap,
+            Instruction *SeqStartI, Instruction *SeqEndI) {
+          // Isolate the instructions of the sequential region to a separate
+          // block.
+          BasicBlock *ParentBB = SeqStartI->getParent();
+          BasicBlock *SeqEndBB =
+              SplitBlock(ParentBB, SeqEndI->getNextNode(), DT, LI);
+          BasicBlock *SeqAfterBB =
+              SplitBlock(SeqEndBB, &*SeqEndBB->getFirstInsertionPt(), DT, LI);
+          BasicBlock *SeqStartBB = SplitBlock(ParentBB, SeqStartI, DT, LI,
+                                              nullptr, "seq.par.merged");
+
+          assert(ParentBB->getUniqueSuccessor() == SeqStartBB &&
+                 "Expected a different CFG");
+          const DebugLoc DL = ParentBB->getTerminator()->getDebugLoc();
+          ParentBB->getTerminator()->eraseFromParent();
+
+          auto BodyGenCB = [&](InsertPointTy AllocaIP, InsertPointTy CodeGenIP,
+                               BasicBlock &ContinuationIP) {
+            BasicBlock *CGStartBB = CodeGenIP.getBlock();
+            BasicBlock *CGEndBB =
+                SplitBlock(CGStartBB, &*CodeGenIP.getPoint(), DT, LI);
+            assert(SeqStartBB != nullptr && "SeqStartBB should not be null");
+            CGStartBB->getTerminator()->setSuccessor(0, SeqStartBB);
+            assert(SeqEndBB != nullptr && "SeqEndBB should not be null");
+            SeqEndBB->getTerminator()->setSuccessor(0, CGEndBB);
+          };
+          auto FiniCB = [&](InsertPointTy CodeGenIP) {};
+
+          // Find inputs/outputs to/from the sequential region. Convert inputs
+          // to pointer uses in the sequential region. Broadcast outputs to
+          // users outside the soon-to-be-merged region. Sinking/hoisting
+          // candidates do not require any action, they will be handled by
+          // OMPIRBuilder when extracting the merged outlined region.
+          CodeExtractorAnalysisCache CEAC(*OuterFn);
+          CodeExtractor Extractor({SeqStartBB}, /* DominatorTree */ nullptr,
+                                  /* AggregateArgs */ false,
+                                  /* BlockFrequencyInfo */ nullptr,
+                                  /* BranchProbabilityInfo */ nullptr,
+                                  /* AssumptionCache */ nullptr,
+                                  /* AllowVarArgs */ true,
+                                  /* AllowAlloca */ true,
+                                  /* Suffix */ ".seq");
+
+          BasicBlock *CommonExit = nullptr;
+          SetVector<Value *> Inputs, Outputs, SinkingCands, HoistingCands;
+          Extractor.findAllocas(CEAC, SinkingCands, HoistingCands, CommonExit);
+          Extractor.findInputsOutputs(Inputs, Outputs, SinkingCands);
+
+          for (Value *V : Inputs) {
+            if (ReplacementInputMap.count(V)) {
+              // Value is not replaced, so continue.
+              if (ReplacementInputMap[V] == V)
+                continue;
+            }
+
+            // Collect user instructions in the sequential BB.
+            SmallPtrSet<Instruction *, 8> UsersI;
+            for (User *U : V->users()) {
+              Instruction *UI = dyn_cast<Instruction>(U);
+              if (!UI)
+                continue;
+              if (UI->getParent() != SeqStartBB)
+                continue;
+
+              UsersI.insert(UI);
+            }
+
+            // Emit an alloca and a matching store for the input replacement
+            // value in the outer function.
+            if (!ReplacementInputMap.count(V)) {
+              const DataLayout &DL = M.getDataLayout();
+              AllocaInst *AllocaI = new AllocaInst(
+                  V->getType(), DL.getAllocaAddrSpace(), nullptr,
+                  V->getName() + ".seq.input.alloc", &OuterFn->front().front());
+
+              new StoreInst(V, AllocaI, OuterPredBB);
+
+              // Replace value with this alloca, will be also used for later
+              // sequential regions.
+              ReplacementInputMap[V] = AllocaI;
+            }
+
+            LoadInst *LoadI = new LoadInst(V->getType(), ReplacementInputMap[V],
+                                           V->getName() + ".seq.input.load",
+                                           &*SeqStartBB->getFirstInsertionPt());
+
+            for (Instruction *I : UsersI)
+              I->replaceUsesOfWith(V, LoadI);
+          }
+
+          for (Value *V : Outputs) {
+            SmallPtrSet<Instruction *, 8> EscapingUsers;
+
+            // Find all escaping users, outside this sequential region.
+            for (User *U : V->users()) {
+              Instruction *UI = dyn_cast<Instruction>(U);
+              if (!UI)
+                continue;
+
+              if (UI->getParent() == SeqStartBB)
+                continue;
+
+              EscapingUsers.insert(UI);
+            }
+
+            // Emit an alloca in the outer region to store the broadcasted
+            // value.
+            const DataLayout &DL = M.getDataLayout();
+            AllocaInst *AllocaI = new AllocaInst(
+                V->getType(), DL.getAllocaAddrSpace(), nullptr,
+                V->getName() + ".seq.output.alloc", &OuterFn->front().front());
+
+            // Emit a store instruction in the sequential BB to update the
+            // value.
+            new StoreInst(V, AllocaI, SeqStartBB->getTerminator());
+
+            // If a later merged sequential regions has this value as an input,
+            // it will directly use the alloca.
+            ReplacementInputMap[AllocaI] = AllocaI;
+
+            // Emit a load instruction and replace the use of the escaping value
+            // with it.
+            for (Instruction *UI : EscapingUsers) {
+              LoadInst *LoadI = new LoadInst(
+                  V->getType(), AllocaI, V->getName() + ".seq.output.load", UI);
+              UI->replaceUsesOfWith(V, LoadI);
+            }
+          }
+
+          OpenMPIRBuilder::LocationDescription Loc(
+              InsertPointTy(ParentBB, ParentBB->end()), DL);
+          InsertPointTy SeqAfterIP =
+              OMPInfoCache.OMPBuilder.CreateMaster(Loc, BodyGenCB, FiniCB);
+
+          OMPInfoCache.OMPBuilder.CreateBarrier(SeqAfterIP, OMPD_parallel);
+
+          BranchInst::Create(SeqAfterBB, SeqAfterIP.getBlock());
+
+          LLVM_DEBUG(dbgs() << TAG << "After sequential inlining " << *OuterFn
+                            << "\n");
+        };
+
     // Helper to merge the __kmpc_fork_call calls in MergableCIs. They are all
     // contained in BB and only separated by instructions that can be
     // redundantly executed in parallel. The block BB is split before the first
@@ -683,6 +825,32 @@
       const DebugLoc DL = BB->getTerminator()->getDebugLoc();
       BB->getTerminator()->eraseFromParent();
 
+      // Initialize the map of replacement inputs used by sequential regions
+      // within the merged parallel region. Input values that are arguments to
+      // already outlined parallel regions to-be-merged are already allocated in
+      // the outer function, so they map to themselves. Sequential regions that
+      // have those values as inputs directly use them.
+      SmallDenseMap<Value *, Value *> ReplacementInputMap;
+      for (auto *It = MergableCIs.begin(); It != MergableCIs.end(); ++It) {
+        CallInst *ForkCI = *It;
+        for (Value *V : ForkCI->args())
+          ReplacementInputMap[V] = V;
+      }
+      // Create sequential regions for sequential instructions that are
+      // in-between mergable parallel regions.
+      for (auto *It = MergableCIs.begin(); It != MergableCIs.end() - 1; ++It) {
+        Instruction *ForkCI = *It;
+        Instruction *NextForkCI = *(It + 1);
+
+        // Continue if there are not in-between instructions.
+        if (ForkCI->getNextNode() == NextForkCI)
+          continue;
+
+        CreateSequentialRegion(OriginalFn, BB, ReplacementInputMap,
+                               ForkCI->getNextNode(),
+                               NextForkCI->getPrevNode());
+      }
+
       OpenMPIRBuilder::LocationDescription Loc(InsertPointTy(BB, BB->end()),
                                                DL);
       IRBuilder<>::InsertPoint AllocaIP(
@@ -696,7 +864,7 @@
       BranchInst::Create(AfterBB, AfterIP.getBlock());
 
       // Perform the actual outlining.
-      OMPInfoCache.OMPBuilder.finalize();
+      OMPInfoCache.OMPBuilder.finalize(/* AllowExtractorSinking */ true);
 
       Function *OutlinedFn = MergableCIs.front()->getCaller();
 
@@ -783,16 +951,64 @@
       BasicBlock *BB = It.getFirst();
       SmallVector<CallInst *, 4> MergableCIs;
 
+      auto isMergable = [&](Instruction &I, bool IsBeforeMergableRegion) {
+        if (isa<CallInst>(&I)) {
+          if (IsBeforeMergableRegion) {
+            CallInst *CI = cast<CallInst>(&I);
+            Function *CalledFunction = CI->getCalledFunction();
+            // Return false (unmergable) if the call before the parallel
+            // region calls an explicit affinity (proc_bind) or number of
+            // threads (num_threads) compiler-generated function. Those settings
+            // may be incompatible with following parallel regions.
+            // TODO: ICV tracking to detect compatibility.
+            for (auto RFI : UnmergableCallsInfo) {
+              if (CalledFunction == RFI.Declaration)
+                return false;
+            }
+          } else {
+            // Return false (unmergable) if there is a call instruction
+            // in-between parallel regions when it is not an LT intrinsic. It
+            // may call an unmergable OpenMP runtime function in its callpath.
+            // TODO: Keep track of possible OpenMP calls in the callpath.
+            if (!I.isLifetimeStartOrEnd())
+              return false;
+          }
+        }
+
+        // We do not merge across BBs, hence return false (unmergable) if the
+        // instruction is a terminator.
+        if (I.isTerminator())
+          return false;
+
+        return true;
+      };
       // Find maximal number of parallel region CIs that are safe to merge.
-      for (Instruction &I : *BB) {
+      for (auto It = BB->begin(); It != BB->end();) {
+        Instruction &I = *It;
+        It++;
+
         if (CIs.count(&I)) {
           MergableCIs.push_back(cast<CallInst>(&I));
           continue;
         }
 
-        if (isSafeToSpeculativelyExecute(&I, &I, DT))
+        // Continue expanding if the instruction is mergable.
+        if (isMergable(I, MergableCIs.empty()))
           continue;
 
+        // Forward the instruction iterator to skip the next parallel region
+        // since there is an unmergable instruction which can affect it.
+        for (; It != BB->end(); It++) {
+          Instruction &SkipI = *It;
+          if (CIs.count(&SkipI)) {
+            LLVM_DEBUG(dbgs() << TAG << "Skip parallel region " << SkipI
+                              << " due to " << I << "\n");
+            It++;
+            break;
+          }
+        }
+
+        // Store mergable regions found.
         if (MergableCIs.size() > 1) {
           MergableCIsVector.push_back(MergableCIs);
           LLVM_DEBUG(dbgs() << TAG << "Found " << MergableCIs.size()
@@ -817,11 +1033,22 @@
       RFI.clearUsesMap();
       OMPInfoCache.collectUses(RFI, /* CollectStats */ false);
 
-      // Collect uses for the emitted barrier call.
+      // Collect uses for emitted barrier calls.
       OMPInformationCache::RuntimeFunctionInfo &BarrierRFI =
           OMPInfoCache.RFIs[OMPRTL___kmpc_barrier];
       BarrierRFI.clearUsesMap();
       OMPInfoCache.collectUses(BarrierRFI, /* CollectStats */ false);
+
+      // Collect uses for any emitted master/end_master calls.
+      OMPInformationCache::RuntimeFunctionInfo &MasterRFI =
+          OMPInfoCache.RFIs[OMPRTL___kmpc_master];
+      MasterRFI.clearUsesMap();
+      OMPInfoCache.collectUses(MasterRFI, /* CollectStats */ false);
+
+      OMPInformationCache::RuntimeFunctionInfo &EndMasterRFI =
+          OMPInfoCache.RFIs[OMPRTL___kmpc_end_master];
+      EndMasterRFI.clearUsesMap();
+      OMPInfoCache.collectUses(EndMasterRFI, /* CollectStats */ false);
     }
 
     return Changed;
diff --git a/llvm/test/Transforms/OpenMP/parallel_region_merging.ll b/llvm/test/Transforms/OpenMP/parallel_region_merging.ll
--- a/llvm/test/Transforms/OpenMP/parallel_region_merging.ll
+++ b/llvm/test/Transforms/OpenMP/parallel_region_merging.ll
@@ -1,6 +1,232 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --function-signature --include-generated-funcs
+; RUN: opt -S -attributor -openmpopt -openmp-opt-enable-merging  < %s | FileCheck %s
 ; RUN: opt -S -passes='attributor,cgscc(openmpopt)' -openmp-opt-enable-merging  < %s | FileCheck %s
-
+; #include <omp.h>
+; void foo();
+; void use(int);
+; void usef(float);
+; void merge(int a) {
+; #pragma omp parallel
+;     {
+;         use(a);
+;     }
+; #pragma omp parallel
+;     {
+;         use(a);
+;     }
+; }
+; void unmergable_proc_bind(int a) {
+; #pragma omp parallel proc_bind(close)
+;     {
+;         use(a);
+;     }
+; #pragma omp parallel
+;     {
+;         use(a);
+;     }
+; }
+; void unmergable_num_threads(int a) {
+; #pragma omp parallel num_threads(a)
+;     {
+;         use(a);
+;     }
+; #pragma omp parallel
+;     {
+;         use(a);
+;     }
+; }
+; void unmergable_seq_call(int a) {
+; #pragma omp parallel
+;     {
+;         use(a);
+;     }
+;     foo();
+; #pragma omp parallel
+;     {
+;         use(a);
+;     }
+; }
+; void merge_seq(int a) {
+; #pragma omp parallel
+;     {
+;         use(a);
+;     }
+;     a = a + 1;
+; #pragma omp parallel
+;     {
+;         use(a);
+;     }
+;     use(a);
+; }
+; void merge_seq_float(float f, float *p) {
+; #pragma omp parallel
+;     {
+;         use(f);
+;     }
+;     *p = f + 3.14f;
+; #pragma omp parallel
+;     {
+;         use(f);
+;     }
+; }
+; void merge_seq_firstprivate(int a) {
+; #pragma omp parallel
+;     {
+;         use(a);
+;     }
+;     a = a + 1;
+; #pragma omp parallel firstprivate(a)
+;     {
+;         use(a);
+;     }
+;     use(a);
+; }
+; void merge_seq_sink_lt(int a) {
+; #pragma omp parallel
+;     {
+;         use(a);
+;     }
+;     {
+;         int b = (int)&b;
+;     }
+; #pragma omp parallel
+;     {
+;         use(a);
+;     }
+; }
+; void merge_seq_par_use(int a) {
+; #pragma omp parallel
+;     {
+;         use(a);
+;     }
+;     int b = a + 1;
+; #pragma omp parallel
+;     {
+;         use(a);
+;         use(b);
+;     }
+; }
+; void merge_cancellable_regions(int cancel1, int cancel2)
+; {
+; #pragma omp parallel
+;     {
+;         if(cancel1) {
+; #pragma omp cancel parallel
+;         }
+;     }
+; #pragma omp parallel
+;     {
+;         if (cancel2) {
+; #pragma omp cancel parallel
+;         }
+;     }
+; }
+; void merge_cancellable_regions_seq(int cancel1, int cancel2)
+; {
+; #pragma omp parallel
+;     {
+;         if(cancel1) {
+; #pragma omp cancel parallel
+;         }
+;     }
+;     cancel2 = !cancel1;
+; #pragma omp parallel
+;     {
+;         if (cancel2) {
+; #pragma omp cancel parallel
+;         }
+;     }
+; }
+; void merge_3(int a) {
+; #pragma omp parallel
+;     {
+;         use(a);
+;     }
+; #pragma omp parallel
+;     {
+;         use(a);
+;     }
+; #pragma omp parallel
+;     {
+;         use(a);
+;     }
+; }
+; void merge_3_seq(int a, int b) {
+; #pragma omp parallel
+;     {
+;         use(a);
+;     }
+;     b = a + 1;
+; #pragma omp parallel
+;     {
+;         use(a);
+;     }
+;     b = b + a;
+; #pragma omp parallel
+;     {
+;         use(a);
+;     }
+;     use(b);
+; }
+; void unmergable_3_seq_call(int a) {
+; #pragma omp parallel
+;     {
+;         use(a);
+;     }
+;     foo();
+; #pragma omp parallel
+;     {
+;         use(a);
+;     }
+;     foo();
+; #pragma omp parallel
+;     {
+;         use(a);
+;     }
+; }
+; void unmergable_3_proc_bind(int a) {
+; #pragma omp parallel
+;     {
+;         use(a);
+;     }
+; #pragma omp parallel proc_bind(close)
+;     {
+;         use(a);
+;     }
+; #pragma omp parallel
+;     {
+;         use(a);
+;     }
+; }
+; void unmergable_3_num_threads(int a) {
+; #pragma omp parallel
+;     {
+;         use(a);
+;     }
+; #pragma omp parallel num_threads(a)
+;     {
+;         use(a);
+;     }
+; #pragma omp parallel
+;     {
+;         use(a);
+;     }
+; }
+; void merge_2_unmergable_1(int a) {
+; #pragma omp parallel
+;     {
+;         use(a);
+;     }
+; #pragma omp parallel
+;     {
+;         use(a);
+;     }
+;     foo();
+; #pragma omp parallel
+;     {
+;         use(a);
+;     }
+; }
 target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"
 
 %struct.ident_t = type { i32, i32, i32, i32, i8* }
@@ -8,219 +234,583 @@
 @0 = private unnamed_addr constant [23 x i8] c";unknown;unknown;0;0;;\00", align 1
 @1 = private unnamed_addr constant %struct.ident_t { i32 0, i32 2, i32 0, i32 0, i8* getelementptr inbounds ([23 x i8], [23 x i8]* @0, i32 0, i32 0) }, align 8
 
-;   void merge_all() {
-;       int a = 1;
-;   #pragma omp parallel
-;       {
-;           a = 2;
-;       }
-;   #pragma omp parallel
-;       {
-;           a = 3;
-;       }
-;   }
-;
-; Merge all parallel regions.
-define dso_local void @merge_all() local_unnamed_addr  {
-  %1 = alloca i32, align 4
-  %2 = bitcast i32* %1 to i8*
-  store i32 1, i32* %1, align 4
-  %3 = call i32 @__kmpc_global_thread_num(%struct.ident_t* nonnull @1)
-  call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* nonnull @1, i32 1, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, i32*)* @merge_all..omp_par to void (i32*, i32*, ...)*), i32* nonnull %1)
-  %4 = call i32 @__kmpc_global_thread_num(%struct.ident_t* nonnull @1)
-  call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* nonnull @1, i32 1, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, i32*)* @merge_all..omp_par.1 to void (i32*, i32*, ...)*), i32* nonnull %1)
+define dso_local void @merge(i32 %a) local_unnamed_addr  {
+entry:
+  %a.addr = alloca i32, align 4
+  store i32 %a, i32* %a.addr, align 4
+  call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* nonnull @1, i32 1, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, i32*)* @.omp_outlined. to void (i32*, i32*, ...)*), i32* nonnull %a.addr)
+  call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* nonnull @1, i32 1, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, i32*)* @.omp_outlined..1 to void (i32*, i32*, ...)*), i32* nonnull %a.addr)
   ret void
 }
 
-define internal void @merge_all..omp_par.1(i32* noalias nocapture readnone %0, i32* noalias nocapture readnone %1, i32* nocapture %2)  {
-  store i32 3, i32* %2, align 4
+define internal void @.omp_outlined.(i32* noalias nocapture readnone %.global_tid., i32* noalias nocapture readnone %.bound_tid., i32* nocapture nonnull readonly align 4 dereferenceable(4) %a)  {
+entry:
+  %0 = load i32, i32* %a, align 4
+  call void @use(i32 %0)
   ret void
 }
 
-define internal void @merge_all..omp_par(i32* noalias nocapture readnone %0, i32* noalias nocapture readnone %1, i32* nocapture %2)  {
-  store i32 2, i32* %2, align 4
+declare dso_local void @use(i32) local_unnamed_addr
+
+declare !callback !1 void @__kmpc_fork_call(%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) local_unnamed_addr
+
+define internal void @.omp_outlined..1(i32* noalias nocapture readnone %.global_tid., i32* noalias nocapture readnone %.bound_tid., i32* nocapture nonnull readonly align 4 dereferenceable(4) %a)  {
+entry:
+  %0 = load i32, i32* %a, align 4
+  call void @use(i32 %0)
   ret void
 }
 
+define dso_local void @unmergable_proc_bind(i32 %a) local_unnamed_addr  {
+entry:
+  %a.addr = alloca i32, align 4
+  %0 = call i32 @__kmpc_global_thread_num(%struct.ident_t* nonnull @1)
+  store i32 %a, i32* %a.addr, align 4
+  call void @__kmpc_push_proc_bind(%struct.ident_t* nonnull @1, i32 %0, i32 3)
+  call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* nonnull @1, i32 1, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, i32*)* @.omp_outlined..2 to void (i32*, i32*, ...)*), i32* nonnull %a.addr)
+  call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* nonnull @1, i32 1, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, i32*)* @.omp_outlined..3 to void (i32*, i32*, ...)*), i32* nonnull %a.addr)
+  ret void
+}
+
+define internal void @.omp_outlined..2(i32* noalias nocapture readnone %.global_tid., i32* noalias nocapture readnone %.bound_tid., i32* nocapture nonnull readonly align 4 dereferenceable(4) %a)  {
+entry:
+  %0 = load i32, i32* %a, align 4
+  call void @use(i32 %0)
+  ret void
+}
 
 declare i32 @__kmpc_global_thread_num(%struct.ident_t*) local_unnamed_addr
 
-declare !callback !1 void @__kmpc_fork_call(%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) local_unnamed_addr
+declare void @__kmpc_push_proc_bind(%struct.ident_t*, i32, i32) local_unnamed_addr
 
-;   void merge_none() {
-;       int a = 1;
-;   #pragma omp parallel
-;       {
-;           a = 2;
-;       }
-;       a = 3;
-;   #pragma omp parallel
-;       {
-;           a = 4;
-;       }
-;   }
-;
-; Does not merge parallel regions, in-between store
-; instruction is unsafe to execute in parallel.
-define dso_local void @merge_none() local_unnamed_addr  {
-  %1 = alloca i32, align 4
-  %2 = bitcast i32* %1 to i8*
-  store i32 1, i32* %1, align 4
-  %3 = call i32 @__kmpc_global_thread_num(%struct.ident_t* nonnull @1)
-  call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* nonnull @1, i32 1, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, i32*)* @merge_none..omp_par to void (i32*, i32*, ...)*), i32* nonnull %1)
-  store i32 3, i32* %1, align 4
-  %4 = call i32 @__kmpc_global_thread_num(%struct.ident_t* nonnull @1)
-  call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* nonnull @1, i32 1, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, i32*)* @merge_none..omp_par.2 to void (i32*, i32*, ...)*), i32* nonnull %1)
+define internal void @.omp_outlined..3(i32* noalias nocapture readnone %.global_tid., i32* noalias nocapture readnone %.bound_tid., i32* nocapture nonnull readonly align 4 dereferenceable(4) %a)  {
+entry:
+  %0 = load i32, i32* %a, align 4
+  call void @use(i32 %0)
   ret void
 }
 
-define internal void @merge_none..omp_par.2(i32* noalias nocapture readnone %0, i32* noalias nocapture readnone %1, i32* nocapture %2)  {
-  store i32 4, i32* %2, align 4
+define dso_local void @unmergable_num_threads(i32 %a) local_unnamed_addr  {
+entry:
+  %a.addr = alloca i32, align 4
+  %0 = call i32 @__kmpc_global_thread_num(%struct.ident_t* nonnull @1)
+  store i32 %a, i32* %a.addr, align 4
+  call void @__kmpc_push_num_threads(%struct.ident_t* nonnull @1, i32 %0, i32 %a)
+  call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* nonnull @1, i32 1, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, i32*)* @.omp_outlined..4 to void (i32*, i32*, ...)*), i32* nonnull %a.addr)
+  call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* nonnull @1, i32 1, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, i32*)* @.omp_outlined..5 to void (i32*, i32*, ...)*), i32* nonnull %a.addr)
   ret void
 }
 
-define internal void @merge_none..omp_par(i32* noalias nocapture readnone %0, i32* noalias nocapture readnone %1, i32* nocapture %2)  {
-  store i32 2, i32* %2, align 4
+define internal void @.omp_outlined..4(i32* noalias nocapture readnone %.global_tid., i32* noalias nocapture readnone %.bound_tid., i32* nocapture nonnull readonly align 4 dereferenceable(4) %a)  {
+entry:
+  %0 = load i32, i32* %a, align 4
+  call void @use(i32 %0)
   ret void
 }
 
-;   void merge_some() {
-;       int a = 1;
-;   #pragma omp parallel
-;       {
-;           a = 2;
-;       }
-;       a = 3;
-;   #pragma omp parallel
-;       {
-;           a = 4;
-;       }
-;   #pragma omp parallel
-;       {
-;           a = 5;
-;       }
-;   }
-;
-; Do not merge first parallel region, due to the
-; unsafe store, but merge the two next parallel
-; regions.
-define dso_local void @merge_some() local_unnamed_addr  {
-  %1 = alloca i32, align 4
-  %2 = bitcast i32* %1 to i8*
-  store i32 1, i32* %1, align 4
-  %3 = call i32 @__kmpc_global_thread_num(%struct.ident_t* nonnull @1)
-  call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* nonnull @1, i32 1, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, i32*)* @merge_some..omp_par to void (i32*, i32*, ...)*), i32* nonnull %1)
-  store i32 3, i32* %1, align 4
-  %4 = call i32 @__kmpc_global_thread_num(%struct.ident_t* nonnull @1)
-  call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* nonnull @1, i32 1, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, i32*)* @merge_some..omp_par.3 to void (i32*, i32*, ...)*), i32* nonnull %1)
-  %5 = call i32 @__kmpc_global_thread_num(%struct.ident_t* nonnull @1)
-  call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* nonnull @1, i32 1, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, i32*)* @merge_some..omp_par.4 to void (i32*, i32*, ...)*), i32* nonnull %1)
+declare void @__kmpc_push_num_threads(%struct.ident_t*, i32, i32) local_unnamed_addr
+
+define internal void @.omp_outlined..5(i32* noalias nocapture readnone %.global_tid., i32* noalias nocapture readnone %.bound_tid., i32* nocapture nonnull readonly align 4 dereferenceable(4) %a)  {
+entry:
+  %0 = load i32, i32* %a, align 4
+  call void @use(i32 %0)
   ret void
 }
 
-define internal void @merge_some..omp_par.4(i32* noalias nocapture readnone %0, i32* noalias nocapture readnone %1, i32* nocapture %2)  {
-  store i32 5, i32* %2, align 4
+define dso_local void @unmergable_seq_call(i32 %a) local_unnamed_addr  {
+entry:
+  %a.addr = alloca i32, align 4
+  store i32 %a, i32* %a.addr, align 4
+  call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* nonnull @1, i32 1, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, i32*)* @.omp_outlined..6 to void (i32*, i32*, ...)*), i32* nonnull %a.addr)
+  call void (...) @foo()
+  call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* nonnull @1, i32 1, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, i32*)* @.omp_outlined..7 to void (i32*, i32*, ...)*), i32* nonnull %a.addr)
   ret void
 }
 
-define internal void @merge_some..omp_par.3(i32* noalias nocapture readnone %0, i32* noalias nocapture readnone %1, i32* nocapture %2)  {
-  store i32 4, i32* %2, align 4
+define internal void @.omp_outlined..6(i32* noalias nocapture readnone %.global_tid., i32* noalias nocapture readnone %.bound_tid., i32* nocapture nonnull readonly align 4 dereferenceable(4) %a)  {
+entry:
+  %0 = load i32, i32* %a, align 4
+  call void @use(i32 %0)
   ret void
 }
 
-define internal void @merge_some..omp_par(i32* noalias nocapture readnone %0, i32* noalias nocapture readnone %1, i32* nocapture %2)  {
-  store i32 2, i32* %2, align 4
+declare dso_local void @foo(...) local_unnamed_addr
+
+define internal void @.omp_outlined..7(i32* noalias nocapture readnone %.global_tid., i32* noalias nocapture readnone %.bound_tid., i32* nocapture nonnull readonly align 4 dereferenceable(4) %a)  {
+entry:
+  %0 = load i32, i32* %a, align 4
+  call void @use(i32 %0)
   ret void
 }
 
-;   void merge_cancellable_regions(int cancel1, int cancel2)
-;   {
-;   #pragma omp parallel
-;       {
-;           if(cancel1) {
-;   #pragma omp cancel parallel
-;           }
-;       }
-;   #pragma omp parallel
-;       {
-;           if (cancel2) {
-;   #pragma omp cancel parallel
-;           }
-;       }
-;   }
-;
-; Merge correctly cancellable regions.
-define dso_local void @merge_cancellable_regions(i32 %0, i32 %1) local_unnamed_addr  {
-  %3 = alloca i32, align 4
-  %4 = alloca i32, align 4
-  store i32 %0, i32* %3, align 4
-  store i32 %1, i32* %4, align 4
-  %5 = call i32 @__kmpc_global_thread_num(%struct.ident_t* nonnull @1)
-  call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* nonnull @1, i32 1, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, i32*)* @merge_cancellable_regions..omp_par to void (i32*, i32*, ...)*), i32* nonnull %3)
-  %6 = call i32 @__kmpc_global_thread_num(%struct.ident_t* nonnull @1)
-  call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* nonnull @1, i32 1, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, i32*)* @merge_cancellable_regions..omp_par.5 to void (i32*, i32*, ...)*), i32* nonnull %4)
+define dso_local void @merge_seq(i32 %a) local_unnamed_addr  {
+entry:
+  %a.addr = alloca i32, align 4
+  store i32 %a, i32* %a.addr, align 4
+  call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* nonnull @1, i32 1, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, i32*)* @.omp_outlined..8 to void (i32*, i32*, ...)*), i32* nonnull %a.addr)
+  %0 = load i32, i32* %a.addr, align 4
+  %add = add nsw i32 %0, 1
+  store i32 %add, i32* %a.addr, align 4
+  call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* nonnull @1, i32 1, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, i32*)* @.omp_outlined..9 to void (i32*, i32*, ...)*), i32* nonnull %a.addr)
+  %1 = load i32, i32* %a.addr, align 4
+  call void @use(i32 %1)
+  ret void
+}
+
+define internal void @.omp_outlined..8(i32* noalias nocapture readnone %.global_tid., i32* noalias nocapture readnone %.bound_tid., i32* nocapture nonnull readonly align 4 dereferenceable(4) %a)  {
+entry:
+  %0 = load i32, i32* %a, align 4
+  call void @use(i32 %0)
+  ret void
+}
+
+define internal void @.omp_outlined..9(i32* noalias nocapture readnone %.global_tid., i32* noalias nocapture readnone %.bound_tid., i32* nocapture nonnull readonly align 4 dereferenceable(4) %a)  {
+entry:
+  %0 = load i32, i32* %a, align 4
+  call void @use(i32 %0)
+  ret void
+}
+
+define dso_local void @merge_seq_float(float %f, float* nocapture %p) local_unnamed_addr  {
+entry:
+  %f.addr = alloca float, align 4
+  store float %f, float* %f.addr, align 4
+  call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* nonnull @1, i32 1, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, float*)* @.omp_outlined..10 to void (i32*, i32*, ...)*), float* nonnull %f.addr)
+  %0 = load float, float* %f.addr, align 4
+  %add = fadd float %0, 0x40091EB860000000
+  store float %add, float* %p, align 4
+  call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* nonnull @1, i32 1, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, float*)* @.omp_outlined..11 to void (i32*, i32*, ...)*), float* nonnull %f.addr)
+  ret void
+}
+
+define internal void @.omp_outlined..10(i32* noalias nocapture readnone %.global_tid., i32* noalias nocapture readnone %.bound_tid., float* nocapture nonnull readonly align 4 dereferenceable(4) %f)  {
+entry:
+  %0 = load float, float* %f, align 4
+  %conv = fptosi float %0 to i32
+  call void @use(i32 %conv)
+  ret void
+}
+
+define internal void @.omp_outlined..11(i32* noalias nocapture readnone %.global_tid., i32* noalias nocapture readnone %.bound_tid., float* nocapture nonnull readonly align 4 dereferenceable(4) %f)  {
+entry:
+  %0 = load float, float* %f, align 4
+  %conv = fptosi float %0 to i32
+  call void @use(i32 %conv)
   ret void
 }
 
-define internal void @merge_cancellable_regions..omp_par.5(i32* noalias nocapture readnone %0, i32* noalias nocapture readnone %1, i32* nocapture readonly %2)  {
-  %4 = load i32, i32* %2, align 4
-  %5 = icmp eq i32 %4, 0
-  br i1 %5, label %6, label %7
+define dso_local void @merge_seq_firstprivate(i32 %a) local_unnamed_addr  {
+entry:
+  %a.addr = alloca i32, align 4
+  store i32 %a, i32* %a.addr, align 4
+  call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* nonnull @1, i32 1, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, i32*)* @.omp_outlined..12 to void (i32*, i32*, ...)*), i32* nonnull %a.addr)
+  %0 = load i32, i32* %a.addr, align 4
+  %add = add nsw i32 %0, 1
+  store i32 %add, i32* %a.addr, align 4
+  %a.casted.sroa.0.0.insert.ext = zext i32 %add to i64
+  call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* nonnull @1, i32 1, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, i64)* @.omp_outlined..13 to void (i32*, i32*, ...)*), i64 %a.casted.sroa.0.0.insert.ext)
+  %1 = load i32, i32* %a.addr, align 4
+  call void @use(i32 %1)
+  ret void
+}
+
+define internal void @.omp_outlined..12(i32* noalias nocapture readnone %.global_tid., i32* noalias nocapture readnone %.bound_tid., i32* nocapture nonnull readonly align 4 dereferenceable(4) %a)  {
+entry:
+  %0 = load i32, i32* %a, align 4
+  call void @use(i32 %0)
+  ret void
+}
+
+define internal void @.omp_outlined..13(i32* noalias nocapture readnone %.global_tid., i32* noalias nocapture readnone %.bound_tid., i64 %a)  {
+entry:
+  %a.addr.sroa.0.0.extract.trunc = trunc i64 %a to i32
+  call void @use(i32 %a.addr.sroa.0.0.extract.trunc)
+  ret void
+}
+
+define dso_local void @merge_seq_sink_lt(i32 %a) local_unnamed_addr  {
+entry:
+  %a.addr = alloca i32, align 4
+  %b = alloca i32, align 4
+  store i32 %a, i32* %a.addr, align 4
+  call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* nonnull @1, i32 1, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, i32*)* @.omp_outlined..14 to void (i32*, i32*, ...)*), i32* nonnull %a.addr)
+  %0 = bitcast i32* %b to i8*
+  call void @llvm.lifetime.start.p0i8(i64 4, i8* nonnull %0)
+  %1 = ptrtoint i32* %b to i64
+  %2 = trunc i64 %1 to i32
+  store i32 %2, i32* %b, align 4
+  call void @llvm.lifetime.end.p0i8(i64 4, i8* nonnull %0)
+  call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* nonnull @1, i32 1, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, i32*)* @.omp_outlined..15 to void (i32*, i32*, ...)*), i32* nonnull %a.addr)
+  ret void
+}
+
+define internal void @.omp_outlined..14(i32* noalias nocapture readnone %.global_tid., i32* noalias nocapture readnone %.bound_tid., i32* nocapture nonnull readonly align 4 dereferenceable(4) %a)  {
+entry:
+  %0 = load i32, i32* %a, align 4
+  call void @use(i32 %0)
+  ret void
+}
+
+declare void @llvm.lifetime.start.p0i8(i64 immarg, i8* nocapture)
 
-6:                                                ; preds = %3
+declare void @llvm.lifetime.end.p0i8(i64 immarg, i8* nocapture)
+
+define internal void @.omp_outlined..15(i32* noalias nocapture readnone %.global_tid., i32* noalias nocapture readnone %.bound_tid., i32* nocapture nonnull readonly align 4 dereferenceable(4) %a)  {
+entry:
+  %0 = load i32, i32* %a, align 4
+  call void @use(i32 %0)
   ret void
+}
 
-7:                                                ; preds = %3
-  %8 = call i32 @__kmpc_global_thread_num(%struct.ident_t* nonnull @1)
-  %9 = call i32 @__kmpc_cancel(%struct.ident_t* nonnull @1, i32 %8, i32 1)
+define dso_local void @merge_seq_par_use(i32 %a) local_unnamed_addr  {
+entry:
+  %a.addr = alloca i32, align 4
+  %b = alloca i32, align 4
+  store i32 %a, i32* %a.addr, align 4
+  call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* nonnull @1, i32 1, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, i32*)* @.omp_outlined..16 to void (i32*, i32*, ...)*), i32* nonnull %a.addr)
+  %0 = bitcast i32* %b to i8*
+  call void @llvm.lifetime.start.p0i8(i64 4, i8* nonnull %0)
+  %1 = load i32, i32* %a.addr, align 4
+  %add = add nsw i32 %1, 1
+  store i32 %add, i32* %b, align 4
+  call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* nonnull @1, i32 2, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, i32*, i32*)* @.omp_outlined..17 to void (i32*, i32*, ...)*), i32* nonnull %a.addr, i32* nonnull %b)
+  call void @llvm.lifetime.end.p0i8(i64 4, i8* nonnull %0)
   ret void
 }
 
-define internal void @merge_cancellable_regions..omp_par(i32* noalias nocapture readnone %0, i32* noalias nocapture readnone %1, i32* nocapture readonly %2)  {
-  %4 = load i32, i32* %2, align 4
-  %5 = icmp eq i32 %4, 0
-  br i1 %5, label %6, label %7
+define internal void @.omp_outlined..16(i32* noalias nocapture readnone %.global_tid., i32* noalias nocapture readnone %.bound_tid., i32* nocapture nonnull readonly align 4 dereferenceable(4) %a)  {
+entry:
+  %0 = load i32, i32* %a, align 4
+  call void @use(i32 %0)
+  ret void
+}
 
-6:                                                ; preds = %3
+define internal void @.omp_outlined..17(i32* noalias nocapture readnone %.global_tid., i32* noalias nocapture readnone %.bound_tid., i32* nocapture nonnull readonly align 4 dereferenceable(4) %a, i32* nocapture nonnull readonly align 4 dereferenceable(4) %b)  {
+entry:
+  %0 = load i32, i32* %a, align 4
+  call void @use(i32 %0)
+  %1 = load i32, i32* %b, align 4
+  call void @use(i32 %1)
   ret void
+}
 
-7:                                                ; preds = %3
-  %8 = call i32 @__kmpc_global_thread_num(%struct.ident_t* nonnull @1)
-  %9 = call i32 @__kmpc_cancel(%struct.ident_t* nonnull @1, i32 %8, i32 1)
+define dso_local void @merge_cancellable_regions(i32 %cancel1, i32 %cancel2) local_unnamed_addr  {
+entry:
+  %cancel1.addr = alloca i32, align 4
+  %cancel2.addr = alloca i32, align 4
+  store i32 %cancel1, i32* %cancel1.addr, align 4
+  store i32 %cancel2, i32* %cancel2.addr, align 4
+  call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* nonnull @1, i32 1, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, i32*)* @.omp_outlined..18 to void (i32*, i32*, ...)*), i32* nonnull %cancel1.addr)
+  call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* nonnull @1, i32 1, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, i32*)* @.omp_outlined..19 to void (i32*, i32*, ...)*), i32* nonnull %cancel2.addr)
+  ret void
+}
+
+define internal void @.omp_outlined..18(i32* noalias nocapture readonly %.global_tid., i32* noalias nocapture readnone %.bound_tid., i32* nocapture nonnull readonly align 4 dereferenceable(4) %cancel1)  {
+entry:
+  %0 = load i32, i32* %cancel1, align 4
+  %tobool.not = icmp eq i32 %0, 0
+  br i1 %tobool.not, label %if.end, label %if.then
+
+if.then:                                          ; preds = %entry
+  %1 = load i32, i32* %.global_tid., align 4
+  %2 = call i32 @__kmpc_cancel(%struct.ident_t* nonnull @1, i32 %1, i32 1)
+  ret void
+
+if.end:                                           ; preds = %entry
   ret void
 }
 
 declare i32 @__kmpc_cancel(%struct.ident_t*, i32, i32) local_unnamed_addr
 
+define internal void @.omp_outlined..19(i32* noalias nocapture readonly %.global_tid., i32* noalias nocapture readnone %.bound_tid., i32* nocapture nonnull readonly align 4 dereferenceable(4) %cancel2)  {
+entry:
+  %0 = load i32, i32* %cancel2, align 4
+  %tobool.not = icmp eq i32 %0, 0
+  br i1 %tobool.not, label %if.end, label %if.then
+
+if.then:                                          ; preds = %entry
+  %1 = load i32, i32* %.global_tid., align 4
+  %2 = call i32 @__kmpc_cancel(%struct.ident_t* nonnull @1, i32 %1, i32 1)
+  ret void
+
+if.end:                                           ; preds = %entry
+  ret void
+}
+
+define dso_local void @merge_cancellable_regions_seq(i32 %cancel1, i32 %cancel2) local_unnamed_addr  {
+entry:
+  %cancel1.addr = alloca i32, align 4
+  %cancel2.addr = alloca i32, align 4
+  store i32 %cancel1, i32* %cancel1.addr, align 4
+  store i32 %cancel2, i32* %cancel2.addr, align 4
+  call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* nonnull @1, i32 1, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, i32*)* @.omp_outlined..20 to void (i32*, i32*, ...)*), i32* nonnull %cancel1.addr)
+  %0 = load i32, i32* %cancel1.addr, align 4
+  %tobool.not = icmp eq i32 %0, 0
+  %lnot.ext = zext i1 %tobool.not to i32
+  store i32 %lnot.ext, i32* %cancel2.addr, align 4
+  call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* nonnull @1, i32 1, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, i32*)* @.omp_outlined..21 to void (i32*, i32*, ...)*), i32* nonnull %cancel2.addr)
+  ret void
+}
+
+define internal void @.omp_outlined..20(i32* noalias nocapture readonly %.global_tid., i32* noalias nocapture readnone %.bound_tid., i32* nocapture nonnull readonly align 4 dereferenceable(4) %cancel1)  {
+entry:
+  %0 = load i32, i32* %cancel1, align 4
+  %tobool.not = icmp eq i32 %0, 0
+  br i1 %tobool.not, label %if.end, label %if.then
+
+if.then:                                          ; preds = %entry
+  %1 = load i32, i32* %.global_tid., align 4
+  %2 = call i32 @__kmpc_cancel(%struct.ident_t* nonnull @1, i32 %1, i32 1)
+  ret void
+
+if.end:                                           ; preds = %entry
+  ret void
+}
+
+define internal void @.omp_outlined..21(i32* noalias nocapture readonly %.global_tid., i32* noalias nocapture readnone %.bound_tid., i32* nocapture nonnull readonly align 4 dereferenceable(4) %cancel2)  {
+entry:
+  %0 = load i32, i32* %cancel2, align 4
+  %tobool.not = icmp eq i32 %0, 0
+  br i1 %tobool.not, label %if.end, label %if.then
+
+if.then:                                          ; preds = %entry
+  %1 = load i32, i32* %.global_tid., align 4
+  %2 = call i32 @__kmpc_cancel(%struct.ident_t* nonnull @1, i32 %1, i32 1)
+  ret void
+
+if.end:                                           ; preds = %entry
+  ret void
+}
+
+define dso_local void @merge_3(i32 %a) local_unnamed_addr  {
+entry:
+  %a.addr = alloca i32, align 4
+  store i32 %a, i32* %a.addr, align 4
+  call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* nonnull @1, i32 1, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, i32*)* @.omp_outlined..22 to void (i32*, i32*, ...)*), i32* nonnull %a.addr)
+  call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* nonnull @1, i32 1, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, i32*)* @.omp_outlined..23 to void (i32*, i32*, ...)*), i32* nonnull %a.addr)
+  call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* nonnull @1, i32 1, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, i32*)* @.omp_outlined..24 to void (i32*, i32*, ...)*), i32* nonnull %a.addr)
+  ret void
+}
+
+define internal void @.omp_outlined..22(i32* noalias nocapture readnone %.global_tid., i32* noalias nocapture readnone %.bound_tid., i32* nocapture nonnull readonly align 4 dereferenceable(4) %a)  {
+entry:
+  %0 = load i32, i32* %a, align 4
+  call void @use(i32 %0)
+  ret void
+}
+
+define internal void @.omp_outlined..23(i32* noalias nocapture readnone %.global_tid., i32* noalias nocapture readnone %.bound_tid., i32* nocapture nonnull readonly align 4 dereferenceable(4) %a)  {
+entry:
+  %0 = load i32, i32* %a, align 4
+  call void @use(i32 %0)
+  ret void
+}
+
+define internal void @.omp_outlined..24(i32* noalias nocapture readnone %.global_tid., i32* noalias nocapture readnone %.bound_tid., i32* nocapture nonnull readonly align 4 dereferenceable(4) %a)  {
+entry:
+  %0 = load i32, i32* %a, align 4
+  call void @use(i32 %0)
+  ret void
+}
+
+define dso_local void @merge_3_seq(i32 %a, i32 %b) local_unnamed_addr  {
+entry:
+  %a.addr = alloca i32, align 4
+  store i32 %a, i32* %a.addr, align 4
+  call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* nonnull @1, i32 1, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, i32*)* @.omp_outlined..25 to void (i32*, i32*, ...)*), i32* nonnull %a.addr)
+  %0 = load i32, i32* %a.addr, align 4
+  %add = add nsw i32 %0, 1
+  call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* nonnull @1, i32 1, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, i32*)* @.omp_outlined..26 to void (i32*, i32*, ...)*), i32* nonnull %a.addr)
+  %1 = load i32, i32* %a.addr, align 4
+  %add1 = add nsw i32 %add, %1
+  call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* nonnull @1, i32 1, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, i32*)* @.omp_outlined..27 to void (i32*, i32*, ...)*), i32* nonnull %a.addr)
+  call void @use(i32 %add1)
+  ret void
+}
+
+define internal void @.omp_outlined..25(i32* noalias nocapture readnone %.global_tid., i32* noalias nocapture readnone %.bound_tid., i32* nocapture nonnull readonly align 4 dereferenceable(4) %a)  {
+entry:
+  %0 = load i32, i32* %a, align 4
+  call void @use(i32 %0)
+  ret void
+}
+
+define internal void @.omp_outlined..26(i32* noalias nocapture readnone %.global_tid., i32* noalias nocapture readnone %.bound_tid., i32* nocapture nonnull readonly align 4 dereferenceable(4) %a)  {
+entry:
+  %0 = load i32, i32* %a, align 4
+  call void @use(i32 %0)
+  ret void
+}
+
+define internal void @.omp_outlined..27(i32* noalias nocapture readnone %.global_tid., i32* noalias nocapture readnone %.bound_tid., i32* nocapture nonnull readonly align 4 dereferenceable(4) %a)  {
+entry:
+  %0 = load i32, i32* %a, align 4
+  call void @use(i32 %0)
+  ret void
+}
+
+define dso_local void @unmergable_3_seq_call(i32 %a) local_unnamed_addr  {
+entry:
+  %a.addr = alloca i32, align 4
+  store i32 %a, i32* %a.addr, align 4
+  call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* nonnull @1, i32 1, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, i32*)* @.omp_outlined..28 to void (i32*, i32*, ...)*), i32* nonnull %a.addr)
+  call void (...) @foo()
+  call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* nonnull @1, i32 1, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, i32*)* @.omp_outlined..29 to void (i32*, i32*, ...)*), i32* nonnull %a.addr)
+  call void (...) @foo()
+  call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* nonnull @1, i32 1, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, i32*)* @.omp_outlined..30 to void (i32*, i32*, ...)*), i32* nonnull %a.addr)
+  ret void
+}
+
+define internal void @.omp_outlined..28(i32* noalias nocapture readnone %.global_tid., i32* noalias nocapture readnone %.bound_tid., i32* nocapture nonnull readonly align 4 dereferenceable(4) %a)  {
+entry:
+  %0 = load i32, i32* %a, align 4
+  call void @use(i32 %0)
+  ret void
+}
+
+define internal void @.omp_outlined..29(i32* noalias nocapture readnone %.global_tid., i32* noalias nocapture readnone %.bound_tid., i32* nocapture nonnull readonly align 4 dereferenceable(4) %a)  {
+entry:
+  %0 = load i32, i32* %a, align 4
+  call void @use(i32 %0)
+  ret void
+}
+
+define internal void @.omp_outlined..30(i32* noalias nocapture readnone %.global_tid., i32* noalias nocapture readnone %.bound_tid., i32* nocapture nonnull readonly align 4 dereferenceable(4) %a)  {
+entry:
+  %0 = load i32, i32* %a, align 4
+  call void @use(i32 %0)
+  ret void
+}
+
+define dso_local void @unmergable_3_proc_bind(i32 %a) local_unnamed_addr  {
+entry:
+  %a.addr = alloca i32, align 4
+  %0 = call i32 @__kmpc_global_thread_num(%struct.ident_t* nonnull @1)
+  store i32 %a, i32* %a.addr, align 4
+  call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* nonnull @1, i32 1, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, i32*)* @.omp_outlined..31 to void (i32*, i32*, ...)*), i32* nonnull %a.addr)
+  call void @__kmpc_push_proc_bind(%struct.ident_t* nonnull @1, i32 %0, i32 3)
+  call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* nonnull @1, i32 1, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, i32*)* @.omp_outlined..32 to void (i32*, i32*, ...)*), i32* nonnull %a.addr)
+  call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* nonnull @1, i32 1, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, i32*)* @.omp_outlined..33 to void (i32*, i32*, ...)*), i32* nonnull %a.addr)
+  ret void
+}
+
+define internal void @.omp_outlined..31(i32* noalias nocapture readnone %.global_tid., i32* noalias nocapture readnone %.bound_tid., i32* nocapture nonnull readonly align 4 dereferenceable(4) %a)  {
+entry:
+  %0 = load i32, i32* %a, align 4
+  call void @use(i32 %0)
+  ret void
+}
+
+define internal void @.omp_outlined..32(i32* noalias nocapture readnone %.global_tid., i32* noalias nocapture readnone %.bound_tid., i32* nocapture nonnull readonly align 4 dereferenceable(4) %a)  {
+entry:
+  %0 = load i32, i32* %a, align 4
+  call void @use(i32 %0)
+  ret void
+}
+
+define internal void @.omp_outlined..33(i32* noalias nocapture readnone %.global_tid., i32* noalias nocapture readnone %.bound_tid., i32* nocapture nonnull readonly align 4 dereferenceable(4) %a)  {
+entry:
+  %0 = load i32, i32* %a, align 4
+  call void @use(i32 %0)
+  ret void
+}
+
+define dso_local void @unmergable_3_num_threads(i32 %a) local_unnamed_addr  {
+entry:
+  %a.addr = alloca i32, align 4
+  %0 = call i32 @__kmpc_global_thread_num(%struct.ident_t* nonnull @1)
+  store i32 %a, i32* %a.addr, align 4
+  call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* nonnull @1, i32 1, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, i32*)* @.omp_outlined..34 to void (i32*, i32*, ...)*), i32* nonnull %a.addr)
+  %1 = load i32, i32* %a.addr, align 4
+  call void @__kmpc_push_num_threads(%struct.ident_t* nonnull @1, i32 %0, i32 %1)
+  call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* nonnull @1, i32 1, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, i32*)* @.omp_outlined..35 to void (i32*, i32*, ...)*), i32* nonnull %a.addr)
+  call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* nonnull @1, i32 1, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, i32*)* @.omp_outlined..36 to void (i32*, i32*, ...)*), i32* nonnull %a.addr)
+  ret void
+}
+
+define internal void @.omp_outlined..34(i32* noalias nocapture readnone %.global_tid., i32* noalias nocapture readnone %.bound_tid., i32* nocapture nonnull readonly align 4 dereferenceable(4) %a)  {
+entry:
+  %0 = load i32, i32* %a, align 4
+  call void @use(i32 %0)
+  ret void
+}
+
+define internal void @.omp_outlined..35(i32* noalias nocapture readnone %.global_tid., i32* noalias nocapture readnone %.bound_tid., i32* nocapture nonnull readonly align 4 dereferenceable(4) %a)  {
+entry:
+  %0 = load i32, i32* %a, align 4
+  call void @use(i32 %0)
+  ret void
+}
+
+define internal void @.omp_outlined..36(i32* noalias nocapture readnone %.global_tid., i32* noalias nocapture readnone %.bound_tid., i32* nocapture nonnull readonly align 4 dereferenceable(4) %a)  {
+entry:
+  %0 = load i32, i32* %a, align 4
+  call void @use(i32 %0)
+  ret void
+}
+
+define dso_local void @merge_2_unmergable_1(i32 %a) local_unnamed_addr  {
+entry:
+  %a.addr = alloca i32, align 4
+  store i32 %a, i32* %a.addr, align 4
+  call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* nonnull @1, i32 1, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, i32*)* @.omp_outlined..37 to void (i32*, i32*, ...)*), i32* nonnull %a.addr)
+  call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* nonnull @1, i32 1, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, i32*)* @.omp_outlined..38 to void (i32*, i32*, ...)*), i32* nonnull %a.addr)
+  call void (...) @foo()
+  call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* nonnull @1, i32 1, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, i32*)* @.omp_outlined..39 to void (i32*, i32*, ...)*), i32* nonnull %a.addr)
+  ret void
+}
+
+define internal void @.omp_outlined..37(i32* noalias nocapture readnone %.global_tid., i32* noalias nocapture readnone %.bound_tid., i32* nocapture nonnull readonly align 4 dereferenceable(4) %a)  {
+entry:
+  %0 = load i32, i32* %a, align 4
+  call void @use(i32 %0)
+  ret void
+}
+
+define internal void @.omp_outlined..38(i32* noalias nocapture readnone %.global_tid., i32* noalias nocapture readnone %.bound_tid., i32* nocapture nonnull readonly align 4 dereferenceable(4) %a)  {
+entry:
+  %0 = load i32, i32* %a, align 4
+  call void @use(i32 %0)
+  ret void
+}
+
+define internal void @.omp_outlined..39(i32* noalias nocapture readnone %.global_tid., i32* noalias nocapture readnone %.bound_tid., i32* nocapture nonnull readonly align 4 dereferenceable(4) %a)  {
+entry:
+  %0 = load i32, i32* %a, align 4
+  call void @use(i32 %0)
+  ret void
+}
+
 
 !llvm.module.flags = !{!0}
 
 !0 = !{i32 1, !"wchar_size", i32 4}
 !1 = !{!2}
 !2 = !{i64 2, i64 -1, i64 -1, i1 true}
-; CHECK-LABEL: define {{[^@]+}}@merge_all() local_unnamed_addr {
-; CHECK-NEXT:    [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* noundef nonnull align 8 dereferenceable(24) [[GLOB1:@.*]])
-; CHECK-NEXT:    [[TMP2:%.*]] = alloca i32, align 4
-; CHECK-NEXT:    store i32 1, i32* [[TMP2]], align 4
-; CHECK-NEXT:    [[OMP_GLOBAL_THREAD_NUM:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* [[GLOB1]])
+; CHECK-LABEL: define {{[^@]+}}@merge
+; CHECK-SAME: (i32 [[A:%.*]]) local_unnamed_addr {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[A_ADDR:%.*]] = alloca i32, align 4
+; CHECK-NEXT:    store i32 [[A]], i32* [[A_ADDR]], align 4
+; CHECK-NEXT:    [[OMP_GLOBAL_THREAD_NUM:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* [[GLOB1:@.*]])
 ; CHECK-NEXT:    br label [[OMP_PARALLEL:%.*]]
 ; CHECK:       omp_parallel:
-; CHECK-NEXT:    call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* [[GLOB1]], i32 1, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, i32*)* @merge_all..omp_par.2 to void (i32*, i32*, ...)*), i32* [[TMP2]])
+; CHECK-NEXT:    call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* [[GLOB1]], i32 1, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, i32*)* @merge..omp_par to void (i32*, i32*, ...)*), i32* [[A_ADDR]])
 ; CHECK-NEXT:    br label [[OMP_PAR_OUTLINED_EXIT:%.*]]
 ; CHECK:       omp.par.outlined.exit:
 ; CHECK-NEXT:    br label [[OMP_PAR_EXIT_SPLIT:%.*]]
 ; CHECK:       omp.par.exit.split:
-; CHECK-NEXT:    br label [[DOTSPLIT_SPLIT:%.*]]
-; CHECK:       .split.split:
+; CHECK-NEXT:    br label [[ENTRY_SPLIT_SPLIT:%.*]]
+; CHECK:       entry.split.split:
 ; CHECK-NEXT:    ret void
 ;
 ;
-; CHECK-LABEL: define {{[^@]+}}@merge_all..omp_par.2
-; CHECK-SAME: (i32* noalias [[TID_ADDR:%.*]], i32* noalias [[ZERO_ADDR:%.*]], i32* [[TMP0:%.*]]) [[ATTR0:#.*]] {
+; CHECK-LABEL: define {{[^@]+}}@merge..omp_par
+; CHECK-SAME: (i32* noalias [[TID_ADDR:%.*]], i32* noalias [[ZERO_ADDR:%.*]], i32* [[A_ADDR:%.*]]) [[ATTR0:#.*]] {
 ; CHECK-NEXT:  omp.par.entry:
 ; CHECK-NEXT:    [[TID_ADDR_LOCAL:%.*]] = alloca i32, align 4
-; CHECK-NEXT:    [[TMP1:%.*]] = load i32, i32* [[TID_ADDR]], align 4
-; CHECK-NEXT:    store i32 [[TMP1]], i32* [[TID_ADDR_LOCAL]], align 4
+; CHECK-NEXT:    [[TMP0:%.*]] = load i32, i32* [[TID_ADDR]], align 4
+; CHECK-NEXT:    store i32 [[TMP0]], i32* [[TID_ADDR_LOCAL]], align 4
 ; CHECK-NEXT:    [[TID:%.*]] = load i32, i32* [[TID_ADDR_LOCAL]], align 4
 ; CHECK-NEXT:    br label [[OMP_PAR_REGION:%.*]]
 ; CHECK:       omp.par.outlined.exit.exitStub:
@@ -228,12 +818,12 @@
 ; CHECK:       omp.par.region:
 ; CHECK-NEXT:    br label [[OMP_PAR_MERGED:%.*]]
 ; CHECK:       omp.par.merged:
-; CHECK-NEXT:    call void @merge_all..omp_par(i32* [[TID_ADDR]], i32* [[ZERO_ADDR]], i32* nocapture nofree noundef nonnull writeonly align 4 dereferenceable(4) [[TMP0]])
+; CHECK-NEXT:    call void @.omp_outlined.(i32* [[TID_ADDR]], i32* [[ZERO_ADDR]], i32* nocapture noundef nonnull readonly align 4 dereferenceable(4) [[A_ADDR]])
 ; CHECK-NEXT:    [[OMP_GLOBAL_THREAD_NUM:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* [[GLOB1]])
 ; CHECK-NEXT:    call void @__kmpc_barrier(%struct.ident_t* [[GLOB2:@.*]], i32 [[OMP_GLOBAL_THREAD_NUM]])
-; CHECK-NEXT:    call void @merge_all..omp_par.1(i32* [[TID_ADDR]], i32* [[ZERO_ADDR]], i32* nocapture nofree noundef nonnull writeonly align 4 dereferenceable(4) [[TMP0]])
-; CHECK-NEXT:    br label [[DOTSPLIT:%.*]]
-; CHECK:       .split:
+; CHECK-NEXT:    call void @.omp_outlined..1(i32* [[TID_ADDR]], i32* [[ZERO_ADDR]], i32* nocapture noundef nonnull readonly align 4 dereferenceable(4) [[A_ADDR]])
+; CHECK-NEXT:    br label [[ENTRY_SPLIT:%.*]]
+; CHECK:       entry.split:
 ; CHECK-NEXT:    br label [[OMP_PAR_REGION_SPLIT:%.*]]
 ; CHECK:       omp.par.region.split:
 ; CHECK-NEXT:    br label [[OMP_PAR_PRE_FINALIZE:%.*]]
@@ -241,65 +831,214 @@
 ; CHECK-NEXT:    br label [[OMP_PAR_OUTLINED_EXIT_EXITSTUB:%.*]]
 ;
 ;
-; CHECK-LABEL: define {{[^@]+}}@merge_all..omp_par.1
-; CHECK-SAME: (i32* noalias nocapture nofree readnone [[TMP0:%.*]], i32* noalias nocapture nofree readnone [[TMP1:%.*]], i32* nocapture nofree noundef nonnull writeonly align 4 dereferenceable(4) [[TMP2:%.*]]) [[ATTR1:#.*]] {
-; CHECK-NEXT:    store i32 3, i32* [[TMP2]], align 4
+; CHECK-LABEL: define {{[^@]+}}@.omp_outlined.
+; CHECK-SAME: (i32* noalias nocapture nofree readnone [[DOTGLOBAL_TID_:%.*]], i32* noalias nocapture nofree readnone [[DOTBOUND_TID_:%.*]], i32* nocapture noundef nonnull readonly align 4 dereferenceable(4) [[A:%.*]]) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP0:%.*]] = load i32, i32* [[A]], align 4
+; CHECK-NEXT:    call void @use(i32 [[TMP0]])
+; CHECK-NEXT:    ret void
+;
+;
+; CHECK-LABEL: define {{[^@]+}}@.omp_outlined..1
+; CHECK-SAME: (i32* noalias nocapture nofree readnone [[DOTGLOBAL_TID_:%.*]], i32* noalias nocapture nofree readnone [[DOTBOUND_TID_:%.*]], i32* nocapture noundef nonnull readonly align 4 dereferenceable(4) [[A:%.*]]) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP0:%.*]] = load i32, i32* [[A]], align 4
+; CHECK-NEXT:    call void @use(i32 [[TMP0]])
+; CHECK-NEXT:    ret void
+;
+;
+; CHECK-LABEL: define {{[^@]+}}@unmergable_proc_bind
+; CHECK-SAME: (i32 [[A:%.*]]) local_unnamed_addr {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[A_ADDR:%.*]] = alloca i32, align 4
+; CHECK-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* noundef nonnull align 8 dereferenceable(24) [[GLOB1]])
+; CHECK-NEXT:    store i32 [[A]], i32* [[A_ADDR]], align 4
+; CHECK-NEXT:    call void @__kmpc_push_proc_bind(%struct.ident_t* noundef nonnull align 8 dereferenceable(24) [[GLOB1]], i32 [[TMP0]], i32 noundef 3)
+; CHECK-NEXT:    call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* noundef nonnull align 8 dereferenceable(24) [[GLOB1]], i32 noundef 1, void (i32*, i32*, ...)* noundef bitcast (void (i32*, i32*, i32*)* @.omp_outlined..2 to void (i32*, i32*, ...)*), i32* nocapture noundef nonnull readonly align 4 dereferenceable(4) [[A_ADDR]])
+; CHECK-NEXT:    call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* noundef nonnull align 8 dereferenceable(24) [[GLOB1]], i32 noundef 1, void (i32*, i32*, ...)* noundef bitcast (void (i32*, i32*, i32*)* @.omp_outlined..3 to void (i32*, i32*, ...)*), i32* nocapture noundef nonnull readonly align 4 dereferenceable(4) [[A_ADDR]])
+; CHECK-NEXT:    ret void
+;
+;
+; CHECK-LABEL: define {{[^@]+}}@.omp_outlined..2
+; CHECK-SAME: (i32* noalias nocapture nofree readnone [[DOTGLOBAL_TID_:%.*]], i32* noalias nocapture nofree readnone [[DOTBOUND_TID_:%.*]], i32* nocapture noundef nonnull readonly align 4 dereferenceable(4) [[A:%.*]]) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP0:%.*]] = load i32, i32* [[A]], align 4
+; CHECK-NEXT:    call void @use(i32 [[TMP0]])
+; CHECK-NEXT:    ret void
+;
+;
+; CHECK-LABEL: define {{[^@]+}}@.omp_outlined..3
+; CHECK-SAME: (i32* noalias nocapture nofree readnone [[DOTGLOBAL_TID_:%.*]], i32* noalias nocapture nofree readnone [[DOTBOUND_TID_:%.*]], i32* nocapture noundef nonnull readonly align 4 dereferenceable(4) [[A:%.*]]) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP0:%.*]] = load i32, i32* [[A]], align 4
+; CHECK-NEXT:    call void @use(i32 [[TMP0]])
+; CHECK-NEXT:    ret void
+;
+;
+; CHECK-LABEL: define {{[^@]+}}@unmergable_num_threads
+; CHECK-SAME: (i32 [[A:%.*]]) local_unnamed_addr {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[A_ADDR:%.*]] = alloca i32, align 4
+; CHECK-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* noundef nonnull align 8 dereferenceable(24) [[GLOB1]])
+; CHECK-NEXT:    store i32 [[A]], i32* [[A_ADDR]], align 4
+; CHECK-NEXT:    call void @__kmpc_push_num_threads(%struct.ident_t* noundef nonnull align 8 dereferenceable(24) [[GLOB1]], i32 [[TMP0]], i32 [[A]])
+; CHECK-NEXT:    call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* noundef nonnull align 8 dereferenceable(24) [[GLOB1]], i32 noundef 1, void (i32*, i32*, ...)* noundef bitcast (void (i32*, i32*, i32*)* @.omp_outlined..4 to void (i32*, i32*, ...)*), i32* nocapture noundef nonnull readonly align 4 dereferenceable(4) [[A_ADDR]])
+; CHECK-NEXT:    call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* noundef nonnull align 8 dereferenceable(24) [[GLOB1]], i32 noundef 1, void (i32*, i32*, ...)* noundef bitcast (void (i32*, i32*, i32*)* @.omp_outlined..5 to void (i32*, i32*, ...)*), i32* nocapture noundef nonnull readonly align 4 dereferenceable(4) [[A_ADDR]])
+; CHECK-NEXT:    ret void
+;
+;
+; CHECK-LABEL: define {{[^@]+}}@.omp_outlined..4
+; CHECK-SAME: (i32* noalias nocapture nofree readnone [[DOTGLOBAL_TID_:%.*]], i32* noalias nocapture nofree readnone [[DOTBOUND_TID_:%.*]], i32* nocapture noundef nonnull readonly align 4 dereferenceable(4) [[A:%.*]]) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP0:%.*]] = load i32, i32* [[A]], align 4
+; CHECK-NEXT:    call void @use(i32 [[TMP0]])
 ; CHECK-NEXT:    ret void
 ;
 ;
-; CHECK-LABEL: define {{[^@]+}}@merge_all..omp_par
-; CHECK-SAME: (i32* noalias nocapture nofree readnone [[TMP0:%.*]], i32* noalias nocapture nofree readnone [[TMP1:%.*]], i32* nocapture nofree noundef nonnull writeonly align 4 dereferenceable(4) [[TMP2:%.*]]) [[ATTR1]] {
-; CHECK-NEXT:    store i32 2, i32* [[TMP2]], align 4
+; CHECK-LABEL: define {{[^@]+}}@.omp_outlined..5
+; CHECK-SAME: (i32* noalias nocapture nofree readnone [[DOTGLOBAL_TID_:%.*]], i32* noalias nocapture nofree readnone [[DOTBOUND_TID_:%.*]], i32* nocapture noundef nonnull readonly align 4 dereferenceable(4) [[A:%.*]]) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP0:%.*]] = load i32, i32* [[A]], align 4
+; CHECK-NEXT:    call void @use(i32 [[TMP0]])
 ; CHECK-NEXT:    ret void
 ;
 ;
-; CHECK-LABEL: define {{[^@]+}}@merge_none() local_unnamed_addr {
-; CHECK-NEXT:    [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* noundef nonnull align 8 dereferenceable(24) [[GLOB1]])
-; CHECK-NEXT:    [[TMP2:%.*]] = alloca i32, align 4
-; CHECK-NEXT:    store i32 1, i32* [[TMP2]], align 4
-; CHECK-NEXT:    call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* noundef nonnull align 8 dereferenceable(24) [[GLOB1]], i32 noundef 1, void (i32*, i32*, ...)* noundef bitcast (void (i32*, i32*, i32*)* @merge_none..omp_par to void (i32*, i32*, ...)*), i32* nocapture nofree noundef nonnull writeonly align 4 dereferenceable(4) [[TMP2]])
-; CHECK-NEXT:    store i32 3, i32* [[TMP2]], align 4
-; CHECK-NEXT:    call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* noundef nonnull align 8 dereferenceable(24) [[GLOB1]], i32 noundef 1, void (i32*, i32*, ...)* noundef bitcast (void (i32*, i32*, i32*)* @merge_none..omp_par.2 to void (i32*, i32*, ...)*), i32* nocapture nofree noundef nonnull writeonly align 4 dereferenceable(4) [[TMP2]])
+; CHECK-LABEL: define {{[^@]+}}@unmergable_seq_call
+; CHECK-SAME: (i32 [[A:%.*]]) local_unnamed_addr {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[A_ADDR:%.*]] = alloca i32, align 4
+; CHECK-NEXT:    store i32 [[A]], i32* [[A_ADDR]], align 4
+; CHECK-NEXT:    call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* noundef nonnull align 8 dereferenceable(24) [[GLOB1]], i32 noundef 1, void (i32*, i32*, ...)* noundef bitcast (void (i32*, i32*, i32*)* @.omp_outlined..6 to void (i32*, i32*, ...)*), i32* nocapture noundef nonnull readonly align 4 dereferenceable(4) [[A_ADDR]])
+; CHECK-NEXT:    call void (...) @foo()
+; CHECK-NEXT:    call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* noundef nonnull align 8 dereferenceable(24) [[GLOB1]], i32 noundef 1, void (i32*, i32*, ...)* noundef bitcast (void (i32*, i32*, i32*)* @.omp_outlined..7 to void (i32*, i32*, ...)*), i32* nocapture noundef nonnull readonly align 4 dereferenceable(4) [[A_ADDR]])
 ; CHECK-NEXT:    ret void
 ;
 ;
-; CHECK-LABEL: define {{[^@]+}}@merge_none..omp_par.2
-; CHECK-SAME: (i32* noalias nocapture nofree readnone [[TMP0:%.*]], i32* noalias nocapture nofree readnone [[TMP1:%.*]], i32* nocapture nofree noundef nonnull writeonly align 4 dereferenceable(4) [[TMP2:%.*]]) [[ATTR1]] {
-; CHECK-NEXT:    store i32 4, i32* [[TMP2]], align 4
+; CHECK-LABEL: define {{[^@]+}}@.omp_outlined..6
+; CHECK-SAME: (i32* noalias nocapture nofree readnone [[DOTGLOBAL_TID_:%.*]], i32* noalias nocapture nofree readnone [[DOTBOUND_TID_:%.*]], i32* nocapture noundef nonnull readonly align 4 dereferenceable(4) [[A:%.*]]) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP0:%.*]] = load i32, i32* [[A]], align 4
+; CHECK-NEXT:    call void @use(i32 [[TMP0]])
 ; CHECK-NEXT:    ret void
 ;
 ;
-; CHECK-LABEL: define {{[^@]+}}@merge_none..omp_par
-; CHECK-SAME: (i32* noalias nocapture nofree readnone [[TMP0:%.*]], i32* noalias nocapture nofree readnone [[TMP1:%.*]], i32* nocapture nofree noundef nonnull writeonly align 4 dereferenceable(4) [[TMP2:%.*]]) [[ATTR1]] {
-; CHECK-NEXT:    store i32 2, i32* [[TMP2]], align 4
+; CHECK-LABEL: define {{[^@]+}}@.omp_outlined..7
+; CHECK-SAME: (i32* noalias nocapture nofree readnone [[DOTGLOBAL_TID_:%.*]], i32* noalias nocapture nofree readnone [[DOTBOUND_TID_:%.*]], i32* nocapture noundef nonnull readonly align 4 dereferenceable(4) [[A:%.*]]) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP0:%.*]] = load i32, i32* [[A]], align 4
+; CHECK-NEXT:    call void @use(i32 [[TMP0]])
 ; CHECK-NEXT:    ret void
 ;
 ;
-; CHECK-LABEL: define {{[^@]+}}@merge_some() local_unnamed_addr {
-; CHECK-NEXT:    [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* noundef nonnull align 8 dereferenceable(24) [[GLOB1]])
-; CHECK-NEXT:    [[TMP2:%.*]] = alloca i32, align 4
-; CHECK-NEXT:    store i32 1, i32* [[TMP2]], align 4
-; CHECK-NEXT:    call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* noundef nonnull align 8 dereferenceable(24) [[GLOB1]], i32 noundef 1, void (i32*, i32*, ...)* noundef bitcast (void (i32*, i32*, i32*)* @merge_some..omp_par to void (i32*, i32*, ...)*), i32* nocapture nofree noundef nonnull writeonly align 4 dereferenceable(4) [[TMP2]])
-; CHECK-NEXT:    store i32 3, i32* [[TMP2]], align 4
+; CHECK-LABEL: define {{[^@]+}}@merge_seq
+; CHECK-SAME: (i32 [[A:%.*]]) local_unnamed_addr {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[A_ADDR:%.*]] = alloca i32, align 4
+; CHECK-NEXT:    store i32 [[A]], i32* [[A_ADDR]], align 4
+; CHECK-NEXT:    [[OMP_GLOBAL_THREAD_NUM2:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* [[GLOB1]])
+; CHECK-NEXT:    br label [[OMP_PARALLEL:%.*]]
+; CHECK:       omp_parallel:
+; CHECK-NEXT:    call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* [[GLOB1]], i32 1, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, i32*)* @merge_seq..omp_par to void (i32*, i32*, ...)*), i32* [[A_ADDR]])
+; CHECK-NEXT:    br label [[OMP_PAR_OUTLINED_EXIT:%.*]]
+; CHECK:       omp.par.outlined.exit:
+; CHECK-NEXT:    br label [[OMP_PAR_EXIT_SPLIT:%.*]]
+; CHECK:       omp.par.exit.split:
+; CHECK-NEXT:    br label [[ENTRY_SPLIT_SPLIT:%.*]]
+; CHECK:       entry.split.split:
+; CHECK-NEXT:    [[TMP0:%.*]] = load i32, i32* [[A_ADDR]], align 4
+; CHECK-NEXT:    call void @use(i32 [[TMP0]])
+; CHECK-NEXT:    ret void
+;
+;
+; CHECK-LABEL: define {{[^@]+}}@merge_seq..omp_par
+; CHECK-SAME: (i32* noalias [[TID_ADDR:%.*]], i32* noalias [[ZERO_ADDR:%.*]], i32* [[A_ADDR:%.*]]) [[ATTR0]] {
+; CHECK-NEXT:  omp.par.entry:
+; CHECK-NEXT:    [[TID_ADDR_LOCAL:%.*]] = alloca i32, align 4
+; CHECK-NEXT:    [[TMP0:%.*]] = load i32, i32* [[TID_ADDR]], align 4
+; CHECK-NEXT:    store i32 [[TMP0]], i32* [[TID_ADDR_LOCAL]], align 4
+; CHECK-NEXT:    [[TID:%.*]] = load i32, i32* [[TID_ADDR_LOCAL]], align 4
+; CHECK-NEXT:    br label [[OMP_PAR_REGION:%.*]]
+; CHECK:       omp.par.outlined.exit.exitStub:
+; CHECK-NEXT:    ret void
+; CHECK:       omp.par.region:
+; CHECK-NEXT:    br label [[OMP_PAR_MERGED:%.*]]
+; CHECK:       omp.par.merged:
+; CHECK-NEXT:    call void @.omp_outlined..8(i32* [[TID_ADDR]], i32* [[ZERO_ADDR]], i32* nocapture noundef nonnull readonly align 4 dereferenceable(4) [[A_ADDR]])
+; CHECK-NEXT:    [[OMP_GLOBAL_THREAD_NUM2:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* [[GLOB1]])
+; CHECK-NEXT:    call void @__kmpc_barrier(%struct.ident_t* [[GLOB2]], i32 [[OMP_GLOBAL_THREAD_NUM2]])
 ; CHECK-NEXT:    [[OMP_GLOBAL_THREAD_NUM:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* [[GLOB1]])
+; CHECK-NEXT:    [[TMP1:%.*]] = call i32 @__kmpc_master(%struct.ident_t* [[GLOB1]], i32 [[OMP_GLOBAL_THREAD_NUM]])
+; CHECK-NEXT:    [[TMP2:%.*]] = icmp ne i32 [[TMP1]], 0
+; CHECK-NEXT:    br i1 [[TMP2]], label [[OMP_REGION_BODY:%.*]], label [[OMP_REGION_END:%.*]]
+; CHECK:       omp_region.end:
+; CHECK-NEXT:    [[OMP_GLOBAL_THREAD_NUM1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* [[GLOB1]])
+; CHECK-NEXT:    call void @__kmpc_barrier(%struct.ident_t* [[GLOB2]], i32 [[OMP_GLOBAL_THREAD_NUM1]])
+; CHECK-NEXT:    br label [[OMP_PAR_MERGED_SPLIT_SPLIT:%.*]]
+; CHECK:       omp.par.merged.split.split:
+; CHECK-NEXT:    call void @.omp_outlined..9(i32* [[TID_ADDR]], i32* [[ZERO_ADDR]], i32* nocapture noundef nonnull readonly align 4 dereferenceable(4) [[A_ADDR]])
+; CHECK-NEXT:    br label [[ENTRY_SPLIT:%.*]]
+; CHECK:       entry.split:
+; CHECK-NEXT:    br label [[OMP_PAR_REGION_SPLIT:%.*]]
+; CHECK:       omp.par.region.split:
+; CHECK-NEXT:    br label [[OMP_PAR_PRE_FINALIZE:%.*]]
+; CHECK:       omp.par.pre_finalize:
+; CHECK-NEXT:    br label [[OMP_PAR_OUTLINED_EXIT_EXITSTUB:%.*]]
+; CHECK:       omp_region.body:
+; CHECK-NEXT:    br label [[SEQ_PAR_MERGED:%.*]]
+; CHECK:       seq.par.merged:
+; CHECK-NEXT:    [[TMP3:%.*]] = load i32, i32* [[A_ADDR]], align 4
+; CHECK-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP3]], 1
+; CHECK-NEXT:    store i32 [[ADD]], i32* [[A_ADDR]], align 4
+; CHECK-NEXT:    br label [[OMP_PAR_MERGED_SPLIT:%.*]]
+; CHECK:       omp.par.merged.split:
+; CHECK-NEXT:    br label [[OMP_REGION_BODY_SPLIT:%.*]]
+; CHECK:       omp_region.body.split:
+; CHECK-NEXT:    call void @__kmpc_end_master(%struct.ident_t* [[GLOB1]], i32 [[OMP_GLOBAL_THREAD_NUM]])
+; CHECK-NEXT:    br label [[OMP_REGION_END]]
+;
+;
+; CHECK-LABEL: define {{[^@]+}}@.omp_outlined..8
+; CHECK-SAME: (i32* noalias nocapture nofree readnone [[DOTGLOBAL_TID_:%.*]], i32* noalias nocapture nofree readnone [[DOTBOUND_TID_:%.*]], i32* nocapture noundef nonnull readonly align 4 dereferenceable(4) [[A:%.*]]) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP0:%.*]] = load i32, i32* [[A]], align 4
+; CHECK-NEXT:    call void @use(i32 [[TMP0]])
+; CHECK-NEXT:    ret void
+;
+;
+; CHECK-LABEL: define {{[^@]+}}@.omp_outlined..9
+; CHECK-SAME: (i32* noalias nocapture nofree readnone [[DOTGLOBAL_TID_:%.*]], i32* noalias nocapture nofree readnone [[DOTBOUND_TID_:%.*]], i32* nocapture noundef nonnull readonly align 4 dereferenceable(4) [[A:%.*]]) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP0:%.*]] = load i32, i32* [[A]], align 4
+; CHECK-NEXT:    call void @use(i32 [[TMP0]])
+; CHECK-NEXT:    ret void
+;
+;
+; CHECK-LABEL: define {{[^@]+}}@merge_seq_float
+; CHECK-SAME: (float [[F:%.*]], float* nocapture writeonly [[P:%.*]]) local_unnamed_addr {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[P_SEQ_INPUT_ALLOC:%.*]] = alloca float*, align 8
+; CHECK-NEXT:    [[F_ADDR:%.*]] = alloca float, align 4
+; CHECK-NEXT:    store float [[F]], float* [[F_ADDR]], align 4
+; CHECK-NEXT:    store float* [[P]], float** [[P_SEQ_INPUT_ALLOC]], align 8
+; CHECK-NEXT:    [[OMP_GLOBAL_THREAD_NUM2:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* [[GLOB1]])
 ; CHECK-NEXT:    br label [[OMP_PARALLEL:%.*]]
 ; CHECK:       omp_parallel:
-; CHECK-NEXT:    call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* [[GLOB1]], i32 1, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, i32*)* @merge_some..omp_par.5 to void (i32*, i32*, ...)*), i32* [[TMP2]])
+; CHECK-NEXT:    call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* [[GLOB1]], i32 2, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, float*, float**)* @merge_seq_float..omp_par to void (i32*, i32*, ...)*), float* [[F_ADDR]], float** [[P_SEQ_INPUT_ALLOC]])
 ; CHECK-NEXT:    br label [[OMP_PAR_OUTLINED_EXIT:%.*]]
 ; CHECK:       omp.par.outlined.exit:
 ; CHECK-NEXT:    br label [[OMP_PAR_EXIT_SPLIT:%.*]]
 ; CHECK:       omp.par.exit.split:
-; CHECK-NEXT:    br label [[DOTSPLIT_SPLIT:%.*]]
-; CHECK:       .split.split:
+; CHECK-NEXT:    br label [[ENTRY_SPLIT_SPLIT:%.*]]
+; CHECK:       entry.split.split:
 ; CHECK-NEXT:    ret void
 ;
 ;
-; CHECK-LABEL: define {{[^@]+}}@merge_some..omp_par.5
-; CHECK-SAME: (i32* noalias [[TID_ADDR:%.*]], i32* noalias [[ZERO_ADDR:%.*]], i32* [[TMP0:%.*]]) [[ATTR0]] {
+; CHECK-LABEL: define {{[^@]+}}@merge_seq_float..omp_par
+; CHECK-SAME: (i32* noalias [[TID_ADDR:%.*]], i32* noalias [[ZERO_ADDR:%.*]], float* [[F_ADDR:%.*]], float** [[P_SEQ_INPUT_ALLOC:%.*]]) [[ATTR0]] {
 ; CHECK-NEXT:  omp.par.entry:
 ; CHECK-NEXT:    [[TID_ADDR_LOCAL:%.*]] = alloca i32, align 4
-; CHECK-NEXT:    [[TMP1:%.*]] = load i32, i32* [[TID_ADDR]], align 4
-; CHECK-NEXT:    store i32 [[TMP1]], i32* [[TID_ADDR_LOCAL]], align 4
+; CHECK-NEXT:    [[TMP0:%.*]] = load i32, i32* [[TID_ADDR]], align 4
+; CHECK-NEXT:    store i32 [[TMP0]], i32* [[TID_ADDR_LOCAL]], align 4
 ; CHECK-NEXT:    [[TID:%.*]] = load i32, i32* [[TID_ADDR_LOCAL]], align 4
 ; CHECK-NEXT:    br label [[OMP_PAR_REGION:%.*]]
 ; CHECK:       omp.par.outlined.exit.exitStub:
@@ -307,63 +1046,346 @@
 ; CHECK:       omp.par.region:
 ; CHECK-NEXT:    br label [[OMP_PAR_MERGED:%.*]]
 ; CHECK:       omp.par.merged:
-; CHECK-NEXT:    call void @merge_some..omp_par.3(i32* [[TID_ADDR]], i32* [[ZERO_ADDR]], i32* nocapture nofree noundef nonnull writeonly align 4 dereferenceable(4) [[TMP0]])
+; CHECK-NEXT:    call void @.omp_outlined..10(i32* [[TID_ADDR]], i32* [[ZERO_ADDR]], float* nocapture noundef nonnull readonly align 4 dereferenceable(4) [[F_ADDR]])
+; CHECK-NEXT:    [[OMP_GLOBAL_THREAD_NUM2:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* [[GLOB1]])
+; CHECK-NEXT:    call void @__kmpc_barrier(%struct.ident_t* [[GLOB2]], i32 [[OMP_GLOBAL_THREAD_NUM2]])
 ; CHECK-NEXT:    [[OMP_GLOBAL_THREAD_NUM:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* [[GLOB1]])
-; CHECK-NEXT:    call void @__kmpc_barrier(%struct.ident_t* [[GLOB2]], i32 [[OMP_GLOBAL_THREAD_NUM]])
-; CHECK-NEXT:    call void @merge_some..omp_par.4(i32* [[TID_ADDR]], i32* [[ZERO_ADDR]], i32* nocapture nofree noundef nonnull writeonly align 4 dereferenceable(4) [[TMP0]])
-; CHECK-NEXT:    br label [[DOTSPLIT:%.*]]
-; CHECK:       .split:
+; CHECK-NEXT:    [[TMP1:%.*]] = call i32 @__kmpc_master(%struct.ident_t* [[GLOB1]], i32 [[OMP_GLOBAL_THREAD_NUM]])
+; CHECK-NEXT:    [[TMP2:%.*]] = icmp ne i32 [[TMP1]], 0
+; CHECK-NEXT:    br i1 [[TMP2]], label [[OMP_REGION_BODY:%.*]], label [[OMP_REGION_END:%.*]]
+; CHECK:       omp_region.end:
+; CHECK-NEXT:    [[OMP_GLOBAL_THREAD_NUM1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* [[GLOB1]])
+; CHECK-NEXT:    call void @__kmpc_barrier(%struct.ident_t* [[GLOB2]], i32 [[OMP_GLOBAL_THREAD_NUM1]])
+; CHECK-NEXT:    br label [[OMP_PAR_MERGED_SPLIT_SPLIT:%.*]]
+; CHECK:       omp.par.merged.split.split:
+; CHECK-NEXT:    call void @.omp_outlined..11(i32* [[TID_ADDR]], i32* [[ZERO_ADDR]], float* nocapture noundef nonnull readonly align 4 dereferenceable(4) [[F_ADDR]])
+; CHECK-NEXT:    br label [[ENTRY_SPLIT:%.*]]
+; CHECK:       entry.split:
 ; CHECK-NEXT:    br label [[OMP_PAR_REGION_SPLIT:%.*]]
 ; CHECK:       omp.par.region.split:
 ; CHECK-NEXT:    br label [[OMP_PAR_PRE_FINALIZE:%.*]]
 ; CHECK:       omp.par.pre_finalize:
 ; CHECK-NEXT:    br label [[OMP_PAR_OUTLINED_EXIT_EXITSTUB:%.*]]
+; CHECK:       omp_region.body:
+; CHECK-NEXT:    br label [[SEQ_PAR_MERGED:%.*]]
+; CHECK:       seq.par.merged:
+; CHECK-NEXT:    [[P_SEQ_INPUT_LOAD:%.*]] = load float*, float** [[P_SEQ_INPUT_ALLOC]], align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load float, float* [[F_ADDR]], align 4
+; CHECK-NEXT:    [[ADD:%.*]] = fadd float [[TMP3]], 0x40091EB860000000
+; CHECK-NEXT:    store float [[ADD]], float* [[P_SEQ_INPUT_LOAD]], align 4
+; CHECK-NEXT:    br label [[OMP_PAR_MERGED_SPLIT:%.*]]
+; CHECK:       omp.par.merged.split:
+; CHECK-NEXT:    br label [[OMP_REGION_BODY_SPLIT:%.*]]
+; CHECK:       omp_region.body.split:
+; CHECK-NEXT:    call void @__kmpc_end_master(%struct.ident_t* [[GLOB1]], i32 [[OMP_GLOBAL_THREAD_NUM]])
+; CHECK-NEXT:    br label [[OMP_REGION_END]]
 ;
 ;
-; CHECK-LABEL: define {{[^@]+}}@merge_some..omp_par.4
-; CHECK-SAME: (i32* noalias nocapture nofree readnone [[TMP0:%.*]], i32* noalias nocapture nofree readnone [[TMP1:%.*]], i32* nocapture nofree noundef nonnull writeonly align 4 dereferenceable(4) [[TMP2:%.*]]) [[ATTR1]] {
-; CHECK-NEXT:    store i32 5, i32* [[TMP2]], align 4
+; CHECK-LABEL: define {{[^@]+}}@.omp_outlined..10
+; CHECK-SAME: (i32* noalias nocapture nofree readnone [[DOTGLOBAL_TID_:%.*]], i32* noalias nocapture nofree readnone [[DOTBOUND_TID_:%.*]], float* nocapture noundef nonnull readonly align 4 dereferenceable(4) [[F:%.*]]) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP0:%.*]] = load float, float* [[F]], align 4
+; CHECK-NEXT:    [[CONV:%.*]] = fptosi float [[TMP0]] to i32
+; CHECK-NEXT:    call void @use(i32 [[CONV]])
 ; CHECK-NEXT:    ret void
 ;
 ;
-; CHECK-LABEL: define {{[^@]+}}@merge_some..omp_par.3
-; CHECK-SAME: (i32* noalias nocapture nofree readnone [[TMP0:%.*]], i32* noalias nocapture nofree readnone [[TMP1:%.*]], i32* nocapture nofree noundef nonnull writeonly align 4 dereferenceable(4) [[TMP2:%.*]]) [[ATTR1]] {
-; CHECK-NEXT:    store i32 4, i32* [[TMP2]], align 4
+; CHECK-LABEL: define {{[^@]+}}@.omp_outlined..11
+; CHECK-SAME: (i32* noalias nocapture nofree readnone [[DOTGLOBAL_TID_:%.*]], i32* noalias nocapture nofree readnone [[DOTBOUND_TID_:%.*]], float* nocapture noundef nonnull readonly align 4 dereferenceable(4) [[F:%.*]]) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP0:%.*]] = load float, float* [[F]], align 4
+; CHECK-NEXT:    [[CONV:%.*]] = fptosi float [[TMP0]] to i32
+; CHECK-NEXT:    call void @use(i32 [[CONV]])
+; CHECK-NEXT:    ret void
+;
+;
+; CHECK-LABEL: define {{[^@]+}}@merge_seq_firstprivate
+; CHECK-SAME: (i32 [[A:%.*]]) local_unnamed_addr {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[A_CASTED_SROA_0_0_INSERT_EXT_SEQ_OUTPUT_ALLOC:%.*]] = alloca i64, align 8
+; CHECK-NEXT:    [[A_ADDR:%.*]] = alloca i32, align 4
+; CHECK-NEXT:    store i32 [[A]], i32* [[A_ADDR]], align 4
+; CHECK-NEXT:    [[OMP_GLOBAL_THREAD_NUM2:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* [[GLOB1]])
+; CHECK-NEXT:    br label [[OMP_PARALLEL:%.*]]
+; CHECK:       omp_parallel:
+; CHECK-NEXT:    call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* [[GLOB1]], i32 2, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, i32*, i64*)* @merge_seq_firstprivate..omp_par to void (i32*, i32*, ...)*), i32* [[A_ADDR]], i64* [[A_CASTED_SROA_0_0_INSERT_EXT_SEQ_OUTPUT_ALLOC]])
+; CHECK-NEXT:    br label [[OMP_PAR_OUTLINED_EXIT:%.*]]
+; CHECK:       omp.par.outlined.exit:
+; CHECK-NEXT:    br label [[OMP_PAR_EXIT_SPLIT:%.*]]
+; CHECK:       omp.par.exit.split:
+; CHECK-NEXT:    br label [[ENTRY_SPLIT_SPLIT:%.*]]
+; CHECK:       entry.split.split:
+; CHECK-NEXT:    [[TMP0:%.*]] = load i32, i32* [[A_ADDR]], align 4
+; CHECK-NEXT:    call void @use(i32 [[TMP0]])
 ; CHECK-NEXT:    ret void
 ;
 ;
-; CHECK-LABEL: define {{[^@]+}}@merge_some..omp_par
-; CHECK-SAME: (i32* noalias nocapture nofree readnone [[TMP0:%.*]], i32* noalias nocapture nofree readnone [[TMP1:%.*]], i32* nocapture nofree noundef nonnull writeonly align 4 dereferenceable(4) [[TMP2:%.*]]) [[ATTR1]] {
-; CHECK-NEXT:    store i32 2, i32* [[TMP2]], align 4
+; CHECK-LABEL: define {{[^@]+}}@merge_seq_firstprivate..omp_par
+; CHECK-SAME: (i32* noalias [[TID_ADDR:%.*]], i32* noalias [[ZERO_ADDR:%.*]], i32* [[A_ADDR:%.*]], i64* [[A_CASTED_SROA_0_0_INSERT_EXT_SEQ_OUTPUT_ALLOC:%.*]]) [[ATTR0]] {
+; CHECK-NEXT:  omp.par.entry:
+; CHECK-NEXT:    [[TID_ADDR_LOCAL:%.*]] = alloca i32, align 4
+; CHECK-NEXT:    [[TMP0:%.*]] = load i32, i32* [[TID_ADDR]], align 4
+; CHECK-NEXT:    store i32 [[TMP0]], i32* [[TID_ADDR_LOCAL]], align 4
+; CHECK-NEXT:    [[TID:%.*]] = load i32, i32* [[TID_ADDR_LOCAL]], align 4
+; CHECK-NEXT:    br label [[OMP_PAR_REGION:%.*]]
+; CHECK:       omp.par.outlined.exit.exitStub:
+; CHECK-NEXT:    ret void
+; CHECK:       omp.par.region:
+; CHECK-NEXT:    br label [[OMP_PAR_MERGED:%.*]]
+; CHECK:       omp.par.merged:
+; CHECK-NEXT:    call void @.omp_outlined..12(i32* [[TID_ADDR]], i32* [[ZERO_ADDR]], i32* nocapture noundef nonnull readonly align 4 dereferenceable(4) [[A_ADDR]])
+; CHECK-NEXT:    [[OMP_GLOBAL_THREAD_NUM2:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* [[GLOB1]])
+; CHECK-NEXT:    call void @__kmpc_barrier(%struct.ident_t* [[GLOB2]], i32 [[OMP_GLOBAL_THREAD_NUM2]])
+; CHECK-NEXT:    [[OMP_GLOBAL_THREAD_NUM:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* [[GLOB1]])
+; CHECK-NEXT:    [[TMP1:%.*]] = call i32 @__kmpc_master(%struct.ident_t* [[GLOB1]], i32 [[OMP_GLOBAL_THREAD_NUM]])
+; CHECK-NEXT:    [[TMP2:%.*]] = icmp ne i32 [[TMP1]], 0
+; CHECK-NEXT:    br i1 [[TMP2]], label [[OMP_REGION_BODY:%.*]], label [[OMP_REGION_END:%.*]]
+; CHECK:       omp_region.end:
+; CHECK-NEXT:    [[OMP_GLOBAL_THREAD_NUM1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* [[GLOB1]])
+; CHECK-NEXT:    call void @__kmpc_barrier(%struct.ident_t* [[GLOB2]], i32 [[OMP_GLOBAL_THREAD_NUM1]])
+; CHECK-NEXT:    br label [[OMP_PAR_MERGED_SPLIT_SPLIT:%.*]]
+; CHECK:       omp.par.merged.split.split:
+; CHECK-NEXT:    [[A_CASTED_SROA_0_0_INSERT_EXT_SEQ_OUTPUT_LOAD:%.*]] = load i64, i64* [[A_CASTED_SROA_0_0_INSERT_EXT_SEQ_OUTPUT_ALLOC]], align 8
+; CHECK-NEXT:    call void @.omp_outlined..13(i32* [[TID_ADDR]], i32* [[ZERO_ADDR]], i64 [[A_CASTED_SROA_0_0_INSERT_EXT_SEQ_OUTPUT_LOAD]])
+; CHECK-NEXT:    br label [[ENTRY_SPLIT:%.*]]
+; CHECK:       entry.split:
+; CHECK-NEXT:    br label [[OMP_PAR_REGION_SPLIT:%.*]]
+; CHECK:       omp.par.region.split:
+; CHECK-NEXT:    br label [[OMP_PAR_PRE_FINALIZE:%.*]]
+; CHECK:       omp.par.pre_finalize:
+; CHECK-NEXT:    br label [[OMP_PAR_OUTLINED_EXIT_EXITSTUB:%.*]]
+; CHECK:       omp_region.body:
+; CHECK-NEXT:    br label [[SEQ_PAR_MERGED:%.*]]
+; CHECK:       seq.par.merged:
+; CHECK-NEXT:    [[TMP3:%.*]] = load i32, i32* [[A_ADDR]], align 4
+; CHECK-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP3]], 1
+; CHECK-NEXT:    store i32 [[ADD]], i32* [[A_ADDR]], align 4
+; CHECK-NEXT:    [[A_CASTED_SROA_0_0_INSERT_EXT:%.*]] = zext i32 [[ADD]] to i64
+; CHECK-NEXT:    store i64 [[A_CASTED_SROA_0_0_INSERT_EXT]], i64* [[A_CASTED_SROA_0_0_INSERT_EXT_SEQ_OUTPUT_ALLOC]], align 8
+; CHECK-NEXT:    br label [[OMP_PAR_MERGED_SPLIT:%.*]]
+; CHECK:       omp.par.merged.split:
+; CHECK-NEXT:    br label [[OMP_REGION_BODY_SPLIT:%.*]]
+; CHECK:       omp_region.body.split:
+; CHECK-NEXT:    call void @__kmpc_end_master(%struct.ident_t* [[GLOB1]], i32 [[OMP_GLOBAL_THREAD_NUM]])
+; CHECK-NEXT:    br label [[OMP_REGION_END]]
+;
+;
+; CHECK-LABEL: define {{[^@]+}}@.omp_outlined..12
+; CHECK-SAME: (i32* noalias nocapture nofree readnone [[DOTGLOBAL_TID_:%.*]], i32* noalias nocapture nofree readnone [[DOTBOUND_TID_:%.*]], i32* nocapture noundef nonnull readonly align 4 dereferenceable(4) [[A:%.*]]) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP0:%.*]] = load i32, i32* [[A]], align 4
+; CHECK-NEXT:    call void @use(i32 [[TMP0]])
+; CHECK-NEXT:    ret void
+;
+;
+; CHECK-LABEL: define {{[^@]+}}@.omp_outlined..13
+; CHECK-SAME: (i32* noalias nocapture nofree readnone [[DOTGLOBAL_TID_:%.*]], i32* noalias nocapture nofree readnone [[DOTBOUND_TID_:%.*]], i64 [[A:%.*]]) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[A_ADDR_SROA_0_0_EXTRACT_TRUNC:%.*]] = trunc i64 [[A]] to i32
+; CHECK-NEXT:    call void @use(i32 [[A_ADDR_SROA_0_0_EXTRACT_TRUNC]])
+; CHECK-NEXT:    ret void
+;
+;
+; CHECK-LABEL: define {{[^@]+}}@merge_seq_sink_lt
+; CHECK-SAME: (i32 [[A:%.*]]) local_unnamed_addr {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[A_ADDR:%.*]] = alloca i32, align 4
+; CHECK-NEXT:    store i32 [[A]], i32* [[A_ADDR]], align 4
+; CHECK-NEXT:    [[OMP_GLOBAL_THREAD_NUM2:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* [[GLOB1]])
+; CHECK-NEXT:    br label [[OMP_PARALLEL:%.*]]
+; CHECK:       omp_parallel:
+; CHECK-NEXT:    call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* [[GLOB1]], i32 1, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, i32*)* @merge_seq_sink_lt..omp_par to void (i32*, i32*, ...)*), i32* [[A_ADDR]])
+; CHECK-NEXT:    br label [[OMP_PAR_OUTLINED_EXIT:%.*]]
+; CHECK:       omp.par.outlined.exit:
+; CHECK-NEXT:    br label [[OMP_PAR_EXIT_SPLIT:%.*]]
+; CHECK:       omp.par.exit.split:
+; CHECK-NEXT:    br label [[ENTRY_SPLIT_SPLIT:%.*]]
+; CHECK:       entry.split.split:
+; CHECK-NEXT:    ret void
+;
+;
+; CHECK-LABEL: define {{[^@]+}}@merge_seq_sink_lt..omp_par
+; CHECK-SAME: (i32* noalias [[TID_ADDR:%.*]], i32* noalias [[ZERO_ADDR:%.*]], i32* [[A_ADDR:%.*]]) [[ATTR0]] {
+; CHECK-NEXT:  omp.par.entry:
+; CHECK-NEXT:    [[B:%.*]] = alloca i32, align 4
+; CHECK-NEXT:    [[TID_ADDR_LOCAL:%.*]] = alloca i32, align 4
+; CHECK-NEXT:    [[TMP0:%.*]] = load i32, i32* [[TID_ADDR]], align 4
+; CHECK-NEXT:    store i32 [[TMP0]], i32* [[TID_ADDR_LOCAL]], align 4
+; CHECK-NEXT:    [[TID:%.*]] = load i32, i32* [[TID_ADDR_LOCAL]], align 4
+; CHECK-NEXT:    br label [[OMP_PAR_REGION:%.*]]
+; CHECK:       omp.par.outlined.exit.exitStub:
+; CHECK-NEXT:    ret void
+; CHECK:       omp.par.region:
+; CHECK-NEXT:    br label [[OMP_PAR_MERGED:%.*]]
+; CHECK:       omp.par.merged:
+; CHECK-NEXT:    call void @.omp_outlined..14(i32* [[TID_ADDR]], i32* [[ZERO_ADDR]], i32* nocapture noundef nonnull readonly align 4 dereferenceable(4) [[A_ADDR]])
+; CHECK-NEXT:    [[OMP_GLOBAL_THREAD_NUM2:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* [[GLOB1]])
+; CHECK-NEXT:    call void @__kmpc_barrier(%struct.ident_t* [[GLOB2]], i32 [[OMP_GLOBAL_THREAD_NUM2]])
+; CHECK-NEXT:    [[OMP_GLOBAL_THREAD_NUM:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* [[GLOB1]])
+; CHECK-NEXT:    [[TMP1:%.*]] = call i32 @__kmpc_master(%struct.ident_t* [[GLOB1]], i32 [[OMP_GLOBAL_THREAD_NUM]])
+; CHECK-NEXT:    [[TMP2:%.*]] = icmp ne i32 [[TMP1]], 0
+; CHECK-NEXT:    br i1 [[TMP2]], label [[OMP_REGION_BODY:%.*]], label [[OMP_REGION_END:%.*]]
+; CHECK:       omp_region.end:
+; CHECK-NEXT:    [[OMP_GLOBAL_THREAD_NUM1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* [[GLOB1]])
+; CHECK-NEXT:    call void @__kmpc_barrier(%struct.ident_t* [[GLOB2]], i32 [[OMP_GLOBAL_THREAD_NUM1]])
+; CHECK-NEXT:    br label [[OMP_PAR_MERGED_SPLIT_SPLIT:%.*]]
+; CHECK:       omp.par.merged.split.split:
+; CHECK-NEXT:    call void @.omp_outlined..15(i32* [[TID_ADDR]], i32* [[ZERO_ADDR]], i32* nocapture noundef nonnull readonly align 4 dereferenceable(4) [[A_ADDR]])
+; CHECK-NEXT:    br label [[ENTRY_SPLIT:%.*]]
+; CHECK:       entry.split:
+; CHECK-NEXT:    br label [[OMP_PAR_REGION_SPLIT:%.*]]
+; CHECK:       omp.par.region.split:
+; CHECK-NEXT:    br label [[OMP_PAR_PRE_FINALIZE:%.*]]
+; CHECK:       omp.par.pre_finalize:
+; CHECK-NEXT:    br label [[OMP_PAR_OUTLINED_EXIT_EXITSTUB:%.*]]
+; CHECK:       omp_region.body:
+; CHECK-NEXT:    br label [[SEQ_PAR_MERGED:%.*]]
+; CHECK:       seq.par.merged:
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast i32* [[B]] to i8*
+; CHECK-NEXT:    call void @llvm.lifetime.start.p0i8(i64 noundef 4, i8* noundef nonnull [[TMP3]])
+; CHECK-NEXT:    [[TMP4:%.*]] = ptrtoint i32* [[B]] to i64
+; CHECK-NEXT:    [[TMP5:%.*]] = trunc i64 [[TMP4]] to i32
+; CHECK-NEXT:    store i32 [[TMP5]], i32* [[B]], align 4
+; CHECK-NEXT:    call void @llvm.lifetime.end.p0i8(i64 noundef 4, i8* noundef nonnull [[TMP3]])
+; CHECK-NEXT:    br label [[OMP_PAR_MERGED_SPLIT:%.*]]
+; CHECK:       omp.par.merged.split:
+; CHECK-NEXT:    br label [[OMP_REGION_BODY_SPLIT:%.*]]
+; CHECK:       omp_region.body.split:
+; CHECK-NEXT:    call void @__kmpc_end_master(%struct.ident_t* [[GLOB1]], i32 [[OMP_GLOBAL_THREAD_NUM]])
+; CHECK-NEXT:    br label [[OMP_REGION_END]]
+;
+;
+; CHECK-LABEL: define {{[^@]+}}@.omp_outlined..14
+; CHECK-SAME: (i32* noalias nocapture nofree readnone [[DOTGLOBAL_TID_:%.*]], i32* noalias nocapture nofree readnone [[DOTBOUND_TID_:%.*]], i32* nocapture noundef nonnull readonly align 4 dereferenceable(4) [[A:%.*]]) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP0:%.*]] = load i32, i32* [[A]], align 4
+; CHECK-NEXT:    call void @use(i32 [[TMP0]])
+; CHECK-NEXT:    ret void
+;
+;
+; CHECK-LABEL: define {{[^@]+}}@.omp_outlined..15
+; CHECK-SAME: (i32* noalias nocapture nofree readnone [[DOTGLOBAL_TID_:%.*]], i32* noalias nocapture nofree readnone [[DOTBOUND_TID_:%.*]], i32* nocapture noundef nonnull readonly align 4 dereferenceable(4) [[A:%.*]]) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP0:%.*]] = load i32, i32* [[A]], align 4
+; CHECK-NEXT:    call void @use(i32 [[TMP0]])
+; CHECK-NEXT:    ret void
+;
+;
+; CHECK-LABEL: define {{[^@]+}}@merge_seq_par_use
+; CHECK-SAME: (i32 [[A:%.*]]) local_unnamed_addr {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[A_ADDR:%.*]] = alloca i32, align 4
+; CHECK-NEXT:    [[B:%.*]] = alloca i32, align 4
+; CHECK-NEXT:    store i32 [[A]], i32* [[A_ADDR]], align 4
+; CHECK-NEXT:    [[OMP_GLOBAL_THREAD_NUM2:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* [[GLOB1]])
+; CHECK-NEXT:    br label [[OMP_PARALLEL:%.*]]
+; CHECK:       omp_parallel:
+; CHECK-NEXT:    [[LT_CAST3:%.*]] = bitcast i32* [[B]] to i8*
+; CHECK-NEXT:    call void @llvm.lifetime.start.p0i8(i64 -1, i8* [[LT_CAST3]])
+; CHECK-NEXT:    call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* [[GLOB1]], i32 2, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, i32*, i32*)* @merge_seq_par_use..omp_par to void (i32*, i32*, ...)*), i32* [[A_ADDR]], i32* [[B]])
+; CHECK-NEXT:    br label [[OMP_PAR_OUTLINED_EXIT:%.*]]
+; CHECK:       omp.par.outlined.exit:
+; CHECK-NEXT:    br label [[OMP_PAR_EXIT_SPLIT:%.*]]
+; CHECK:       omp.par.exit.split:
+; CHECK-NEXT:    br label [[ENTRY_SPLIT_SPLIT:%.*]]
+; CHECK:       entry.split.split:
+; CHECK-NEXT:    [[LT_CAST:%.*]] = bitcast i32* [[B]] to i8*
+; CHECK-NEXT:    call void @llvm.lifetime.end.p0i8(i64 noundef 4, i8* noundef nonnull [[LT_CAST]])
+; CHECK-NEXT:    ret void
+;
+;
+; CHECK-LABEL: define {{[^@]+}}@merge_seq_par_use..omp_par
+; CHECK-SAME: (i32* noalias [[TID_ADDR:%.*]], i32* noalias [[ZERO_ADDR:%.*]], i32* [[A_ADDR:%.*]], i32* [[B:%.*]]) [[ATTR0]] {
+; CHECK-NEXT:  omp.par.entry:
+; CHECK-NEXT:    [[TID_ADDR_LOCAL:%.*]] = alloca i32, align 4
+; CHECK-NEXT:    [[TMP0:%.*]] = load i32, i32* [[TID_ADDR]], align 4
+; CHECK-NEXT:    store i32 [[TMP0]], i32* [[TID_ADDR_LOCAL]], align 4
+; CHECK-NEXT:    [[TID:%.*]] = load i32, i32* [[TID_ADDR_LOCAL]], align 4
+; CHECK-NEXT:    br label [[OMP_PAR_REGION:%.*]]
+; CHECK:       omp.par.outlined.exit.exitStub:
+; CHECK-NEXT:    ret void
+; CHECK:       omp.par.region:
+; CHECK-NEXT:    br label [[OMP_PAR_MERGED:%.*]]
+; CHECK:       omp.par.merged:
+; CHECK-NEXT:    call void @.omp_outlined..16(i32* [[TID_ADDR]], i32* [[ZERO_ADDR]], i32* nocapture noundef nonnull readonly align 4 dereferenceable(4) [[A_ADDR]])
+; CHECK-NEXT:    [[OMP_GLOBAL_THREAD_NUM2:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* [[GLOB1]])
+; CHECK-NEXT:    call void @__kmpc_barrier(%struct.ident_t* [[GLOB2]], i32 [[OMP_GLOBAL_THREAD_NUM2]])
+; CHECK-NEXT:    [[OMP_GLOBAL_THREAD_NUM:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* [[GLOB1]])
+; CHECK-NEXT:    [[TMP1:%.*]] = call i32 @__kmpc_master(%struct.ident_t* [[GLOB1]], i32 [[OMP_GLOBAL_THREAD_NUM]])
+; CHECK-NEXT:    [[TMP2:%.*]] = icmp ne i32 [[TMP1]], 0
+; CHECK-NEXT:    br i1 [[TMP2]], label [[OMP_REGION_BODY:%.*]], label [[OMP_REGION_END:%.*]]
+; CHECK:       omp_region.end:
+; CHECK-NEXT:    [[OMP_GLOBAL_THREAD_NUM1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* [[GLOB1]])
+; CHECK-NEXT:    call void @__kmpc_barrier(%struct.ident_t* [[GLOB2]], i32 [[OMP_GLOBAL_THREAD_NUM1]])
+; CHECK-NEXT:    br label [[OMP_PAR_MERGED_SPLIT_SPLIT:%.*]]
+; CHECK:       omp.par.merged.split.split:
+; CHECK-NEXT:    call void @.omp_outlined..17(i32* [[TID_ADDR]], i32* [[ZERO_ADDR]], i32* nocapture noundef nonnull readonly align 4 dereferenceable(4) [[A_ADDR]], i32* nocapture noundef nonnull readonly align 4 dereferenceable(4) [[B]])
+; CHECK-NEXT:    br label [[ENTRY_SPLIT:%.*]]
+; CHECK:       entry.split:
+; CHECK-NEXT:    br label [[OMP_PAR_REGION_SPLIT:%.*]]
+; CHECK:       omp.par.region.split:
+; CHECK-NEXT:    br label [[OMP_PAR_PRE_FINALIZE:%.*]]
+; CHECK:       omp.par.pre_finalize:
+; CHECK-NEXT:    br label [[OMP_PAR_OUTLINED_EXIT_EXITSTUB:%.*]]
+; CHECK:       omp_region.body:
+; CHECK-NEXT:    br label [[SEQ_PAR_MERGED:%.*]]
+; CHECK:       seq.par.merged:
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast i32* [[B]] to i8*
+; CHECK-NEXT:    [[TMP4:%.*]] = load i32, i32* [[A_ADDR]], align 4
+; CHECK-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP4]], 1
+; CHECK-NEXT:    store i32 [[ADD]], i32* [[B]], align 4
+; CHECK-NEXT:    br label [[OMP_PAR_MERGED_SPLIT:%.*]]
+; CHECK:       omp.par.merged.split:
+; CHECK-NEXT:    br label [[OMP_REGION_BODY_SPLIT:%.*]]
+; CHECK:       omp_region.body.split:
+; CHECK-NEXT:    call void @__kmpc_end_master(%struct.ident_t* [[GLOB1]], i32 [[OMP_GLOBAL_THREAD_NUM]])
+; CHECK-NEXT:    br label [[OMP_REGION_END]]
+;
+;
+; CHECK-LABEL: define {{[^@]+}}@.omp_outlined..16
+; CHECK-SAME: (i32* noalias nocapture nofree readnone [[DOTGLOBAL_TID_:%.*]], i32* noalias nocapture nofree readnone [[DOTBOUND_TID_:%.*]], i32* nocapture noundef nonnull readonly align 4 dereferenceable(4) [[A:%.*]]) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP0:%.*]] = load i32, i32* [[A]], align 4
+; CHECK-NEXT:    call void @use(i32 [[TMP0]])
+; CHECK-NEXT:    ret void
+;
+;
+; CHECK-LABEL: define {{[^@]+}}@.omp_outlined..17
+; CHECK-SAME: (i32* noalias nocapture nofree readnone [[DOTGLOBAL_TID_:%.*]], i32* noalias nocapture nofree readnone [[DOTBOUND_TID_:%.*]], i32* nocapture noundef nonnull readonly align 4 dereferenceable(4) [[A:%.*]], i32* nocapture noundef nonnull readonly align 4 dereferenceable(4) [[B:%.*]]) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP0:%.*]] = load i32, i32* [[A]], align 4
+; CHECK-NEXT:    call void @use(i32 [[TMP0]])
+; CHECK-NEXT:    [[TMP1:%.*]] = load i32, i32* [[B]], align 4
+; CHECK-NEXT:    call void @use(i32 [[TMP1]])
 ; CHECK-NEXT:    ret void
 ;
 ;
 ; CHECK-LABEL: define {{[^@]+}}@merge_cancellable_regions
-; CHECK-SAME: (i32 [[TMP0:%.*]], i32 [[TMP1:%.*]]) local_unnamed_addr {
-; CHECK-NEXT:    [[TMP3:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* noundef nonnull align 8 dereferenceable(24) [[GLOB1]])
-; CHECK-NEXT:    [[TMP4:%.*]] = alloca i32, align 4
-; CHECK-NEXT:    [[TMP5:%.*]] = alloca i32, align 4
-; CHECK-NEXT:    store i32 [[TMP0]], i32* [[TMP4]], align 4
-; CHECK-NEXT:    store i32 [[TMP1]], i32* [[TMP5]], align 4
+; CHECK-SAME: (i32 [[CANCEL1:%.*]], i32 [[CANCEL2:%.*]]) local_unnamed_addr {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[CANCEL1_ADDR:%.*]] = alloca i32, align 4
+; CHECK-NEXT:    [[CANCEL2_ADDR:%.*]] = alloca i32, align 4
+; CHECK-NEXT:    store i32 [[CANCEL1]], i32* [[CANCEL1_ADDR]], align 4
+; CHECK-NEXT:    store i32 [[CANCEL2]], i32* [[CANCEL2_ADDR]], align 4
 ; CHECK-NEXT:    [[OMP_GLOBAL_THREAD_NUM:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* [[GLOB1]])
 ; CHECK-NEXT:    br label [[OMP_PARALLEL:%.*]]
 ; CHECK:       omp_parallel:
-; CHECK-NEXT:    call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* [[GLOB1]], i32 2, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, i32*, i32*)* @merge_cancellable_regions..omp_par.6 to void (i32*, i32*, ...)*), i32* [[TMP4]], i32* [[TMP5]])
+; CHECK-NEXT:    call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* [[GLOB1]], i32 2, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, i32*, i32*)* @merge_cancellable_regions..omp_par to void (i32*, i32*, ...)*), i32* [[CANCEL1_ADDR]], i32* [[CANCEL2_ADDR]])
 ; CHECK-NEXT:    br label [[OMP_PAR_OUTLINED_EXIT:%.*]]
 ; CHECK:       omp.par.outlined.exit:
 ; CHECK-NEXT:    br label [[OMP_PAR_EXIT_SPLIT:%.*]]
 ; CHECK:       omp.par.exit.split:
-; CHECK-NEXT:    br label [[DOTSPLIT_SPLIT:%.*]]
-; CHECK:       .split.split:
+; CHECK-NEXT:    br label [[ENTRY_SPLIT_SPLIT:%.*]]
+; CHECK:       entry.split.split:
 ; CHECK-NEXT:    ret void
 ;
 ;
-; CHECK-LABEL: define {{[^@]+}}@merge_cancellable_regions..omp_par.6
-; CHECK-SAME: (i32* noalias [[TID_ADDR:%.*]], i32* noalias [[ZERO_ADDR:%.*]], i32* [[TMP0:%.*]], i32* [[TMP1:%.*]]) [[ATTR0]] {
+; CHECK-LABEL: define {{[^@]+}}@merge_cancellable_regions..omp_par
+; CHECK-SAME: (i32* noalias [[TID_ADDR:%.*]], i32* noalias [[ZERO_ADDR:%.*]], i32* [[CANCEL1_ADDR:%.*]], i32* [[CANCEL2_ADDR:%.*]]) [[ATTR0]] {
 ; CHECK-NEXT:  omp.par.entry:
 ; CHECK-NEXT:    [[TID_ADDR_LOCAL:%.*]] = alloca i32, align 4
-; CHECK-NEXT:    [[TMP2:%.*]] = load i32, i32* [[TID_ADDR]], align 4
-; CHECK-NEXT:    store i32 [[TMP2]], i32* [[TID_ADDR_LOCAL]], align 4
+; CHECK-NEXT:    [[TMP0:%.*]] = load i32, i32* [[TID_ADDR]], align 4
+; CHECK-NEXT:    store i32 [[TMP0]], i32* [[TID_ADDR_LOCAL]], align 4
 ; CHECK-NEXT:    [[TID:%.*]] = load i32, i32* [[TID_ADDR_LOCAL]], align 4
 ; CHECK-NEXT:    br label [[OMP_PAR_REGION:%.*]]
 ; CHECK:       omp.par.outlined.exit.exitStub:
@@ -371,12 +1393,12 @@
 ; CHECK:       omp.par.region:
 ; CHECK-NEXT:    br label [[OMP_PAR_MERGED:%.*]]
 ; CHECK:       omp.par.merged:
-; CHECK-NEXT:    call void @merge_cancellable_regions..omp_par(i32* [[TID_ADDR]], i32* [[ZERO_ADDR]], i32* nocapture noundef nonnull readonly align 4 dereferenceable(4) [[TMP0]])
+; CHECK-NEXT:    call void @.omp_outlined..18(i32* [[TID_ADDR]], i32* [[ZERO_ADDR]], i32* nocapture noundef nonnull readonly align 4 dereferenceable(4) [[CANCEL1_ADDR]])
 ; CHECK-NEXT:    [[OMP_GLOBAL_THREAD_NUM:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* [[GLOB1]])
 ; CHECK-NEXT:    call void @__kmpc_barrier(%struct.ident_t* [[GLOB2]], i32 [[OMP_GLOBAL_THREAD_NUM]])
-; CHECK-NEXT:    call void @merge_cancellable_regions..omp_par.5(i32* [[TID_ADDR]], i32* [[ZERO_ADDR]], i32* nocapture noundef nonnull readonly align 4 dereferenceable(4) [[TMP1]])
-; CHECK-NEXT:    br label [[DOTSPLIT:%.*]]
-; CHECK:       .split:
+; CHECK-NEXT:    call void @.omp_outlined..19(i32* [[TID_ADDR]], i32* [[ZERO_ADDR]], i32* nocapture noundef nonnull readonly align 4 dereferenceable(4) [[CANCEL2_ADDR]])
+; CHECK-NEXT:    br label [[ENTRY_SPLIT:%.*]]
+; CHECK:       entry.split:
 ; CHECK-NEXT:    br label [[OMP_PAR_REGION_SPLIT:%.*]]
 ; CHECK:       omp.par.region.split:
 ; CHECK-NEXT:    br label [[OMP_PAR_PRE_FINALIZE:%.*]]
@@ -384,28 +1406,497 @@
 ; CHECK-NEXT:    br label [[OMP_PAR_OUTLINED_EXIT_EXITSTUB:%.*]]
 ;
 ;
-; CHECK-LABEL: define {{[^@]+}}@merge_cancellable_regions..omp_par.5
-; CHECK-SAME: (i32* noalias nocapture nofree readnone [[TMP0:%.*]], i32* noalias nocapture nofree readnone [[TMP1:%.*]], i32* nocapture noundef nonnull readonly align 4 dereferenceable(4) [[TMP2:%.*]]) {
-; CHECK-NEXT:    [[TMP4:%.*]] = load i32, i32* [[TMP2]], align 4
-; CHECK-NEXT:    [[TMP5:%.*]] = icmp eq i32 [[TMP4]], 0
-; CHECK-NEXT:    br i1 [[TMP5]], label [[TMP6:%.*]], label [[TMP7:%.*]]
-; CHECK:       6:
+; CHECK-LABEL: define {{[^@]+}}@.omp_outlined..18
+; CHECK-SAME: (i32* noalias nocapture readonly [[DOTGLOBAL_TID_:%.*]], i32* noalias nocapture nofree readnone [[DOTBOUND_TID_:%.*]], i32* nocapture noundef nonnull readonly align 4 dereferenceable(4) [[CANCEL1:%.*]]) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP0:%.*]] = load i32, i32* [[CANCEL1]], align 4
+; CHECK-NEXT:    [[TOBOOL_NOT:%.*]] = icmp eq i32 [[TMP0]], 0
+; CHECK-NEXT:    br i1 [[TOBOOL_NOT]], label [[IF_END:%.*]], label [[IF_THEN:%.*]]
+; CHECK:       if.then:
+; CHECK-NEXT:    [[TMP1:%.*]] = load i32, i32* [[DOTGLOBAL_TID_]], align 4
+; CHECK-NEXT:    [[TMP2:%.*]] = call i32 @__kmpc_cancel(%struct.ident_t* noundef nonnull [[GLOB1]], i32 [[TMP1]], i32 noundef 1)
 ; CHECK-NEXT:    ret void
-; CHECK:       7:
-; CHECK-NEXT:    [[TMP8:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* noundef nonnull [[GLOB1]])
-; CHECK-NEXT:    [[TMP9:%.*]] = call i32 @__kmpc_cancel(%struct.ident_t* noundef nonnull [[GLOB1]], i32 [[TMP8]], i32 noundef 1)
+; CHECK:       if.end:
 ; CHECK-NEXT:    ret void
 ;
 ;
-; CHECK-LABEL: define {{[^@]+}}@merge_cancellable_regions..omp_par
-; CHECK-SAME: (i32* noalias nocapture nofree readnone [[TMP0:%.*]], i32* noalias nocapture nofree readnone [[TMP1:%.*]], i32* nocapture noundef nonnull readonly align 4 dereferenceable(4) [[TMP2:%.*]]) {
-; CHECK-NEXT:    [[TMP4:%.*]] = load i32, i32* [[TMP2]], align 4
-; CHECK-NEXT:    [[TMP5:%.*]] = icmp eq i32 [[TMP4]], 0
-; CHECK-NEXT:    br i1 [[TMP5]], label [[TMP6:%.*]], label [[TMP7:%.*]]
-; CHECK:       6:
-; CHECK-NEXT:    ret void
-; CHECK:       7:
-; CHECK-NEXT:    [[TMP8:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* noundef nonnull [[GLOB1]])
-; CHECK-NEXT:    [[TMP9:%.*]] = call i32 @__kmpc_cancel(%struct.ident_t* noundef nonnull [[GLOB1]], i32 [[TMP8]], i32 noundef 1)
+; CHECK-LABEL: define {{[^@]+}}@.omp_outlined..19
+; CHECK-SAME: (i32* noalias nocapture readonly [[DOTGLOBAL_TID_:%.*]], i32* noalias nocapture nofree readnone [[DOTBOUND_TID_:%.*]], i32* nocapture noundef nonnull readonly align 4 dereferenceable(4) [[CANCEL2:%.*]]) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP0:%.*]] = load i32, i32* [[CANCEL2]], align 4
+; CHECK-NEXT:    [[TOBOOL_NOT:%.*]] = icmp eq i32 [[TMP0]], 0
+; CHECK-NEXT:    br i1 [[TOBOOL_NOT]], label [[IF_END:%.*]], label [[IF_THEN:%.*]]
+; CHECK:       if.then:
+; CHECK-NEXT:    [[TMP1:%.*]] = load i32, i32* [[DOTGLOBAL_TID_]], align 4
+; CHECK-NEXT:    [[TMP2:%.*]] = call i32 @__kmpc_cancel(%struct.ident_t* noundef nonnull [[GLOB1]], i32 [[TMP1]], i32 noundef 1)
+; CHECK-NEXT:    ret void
+; CHECK:       if.end:
+; CHECK-NEXT:    ret void
+;
+;
+; CHECK-LABEL: define {{[^@]+}}@merge_cancellable_regions_seq
+; CHECK-SAME: (i32 [[CANCEL1:%.*]], i32 [[CANCEL2:%.*]]) local_unnamed_addr {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[CANCEL1_ADDR:%.*]] = alloca i32, align 4
+; CHECK-NEXT:    [[CANCEL2_ADDR:%.*]] = alloca i32, align 4
+; CHECK-NEXT:    store i32 [[CANCEL1]], i32* [[CANCEL1_ADDR]], align 4
+; CHECK-NEXT:    store i32 [[CANCEL2]], i32* [[CANCEL2_ADDR]], align 4
+; CHECK-NEXT:    [[OMP_GLOBAL_THREAD_NUM2:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* [[GLOB1]])
+; CHECK-NEXT:    br label [[OMP_PARALLEL:%.*]]
+; CHECK:       omp_parallel:
+; CHECK-NEXT:    call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* [[GLOB1]], i32 2, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, i32*, i32*)* @merge_cancellable_regions_seq..omp_par to void (i32*, i32*, ...)*), i32* [[CANCEL1_ADDR]], i32* [[CANCEL2_ADDR]])
+; CHECK-NEXT:    br label [[OMP_PAR_OUTLINED_EXIT:%.*]]
+; CHECK:       omp.par.outlined.exit:
+; CHECK-NEXT:    br label [[OMP_PAR_EXIT_SPLIT:%.*]]
+; CHECK:       omp.par.exit.split:
+; CHECK-NEXT:    br label [[ENTRY_SPLIT_SPLIT:%.*]]
+; CHECK:       entry.split.split:
+; CHECK-NEXT:    ret void
+;
+;
+; CHECK-LABEL: define {{[^@]+}}@merge_cancellable_regions_seq..omp_par
+; CHECK-SAME: (i32* noalias [[TID_ADDR:%.*]], i32* noalias [[ZERO_ADDR:%.*]], i32* [[CANCEL1_ADDR:%.*]], i32* [[CANCEL2_ADDR:%.*]]) [[ATTR0]] {
+; CHECK-NEXT:  omp.par.entry:
+; CHECK-NEXT:    [[TID_ADDR_LOCAL:%.*]] = alloca i32, align 4
+; CHECK-NEXT:    [[TMP0:%.*]] = load i32, i32* [[TID_ADDR]], align 4
+; CHECK-NEXT:    store i32 [[TMP0]], i32* [[TID_ADDR_LOCAL]], align 4
+; CHECK-NEXT:    [[TID:%.*]] = load i32, i32* [[TID_ADDR_LOCAL]], align 4
+; CHECK-NEXT:    br label [[OMP_PAR_REGION:%.*]]
+; CHECK:       omp.par.outlined.exit.exitStub:
+; CHECK-NEXT:    ret void
+; CHECK:       omp.par.region:
+; CHECK-NEXT:    br label [[OMP_PAR_MERGED:%.*]]
+; CHECK:       omp.par.merged:
+; CHECK-NEXT:    call void @.omp_outlined..20(i32* [[TID_ADDR]], i32* [[ZERO_ADDR]], i32* nocapture noundef nonnull readonly align 4 dereferenceable(4) [[CANCEL1_ADDR]])
+; CHECK-NEXT:    [[OMP_GLOBAL_THREAD_NUM2:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* [[GLOB1]])
+; CHECK-NEXT:    call void @__kmpc_barrier(%struct.ident_t* [[GLOB2]], i32 [[OMP_GLOBAL_THREAD_NUM2]])
+; CHECK-NEXT:    [[OMP_GLOBAL_THREAD_NUM:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* [[GLOB1]])
+; CHECK-NEXT:    [[TMP1:%.*]] = call i32 @__kmpc_master(%struct.ident_t* [[GLOB1]], i32 [[OMP_GLOBAL_THREAD_NUM]])
+; CHECK-NEXT:    [[TMP2:%.*]] = icmp ne i32 [[TMP1]], 0
+; CHECK-NEXT:    br i1 [[TMP2]], label [[OMP_REGION_BODY:%.*]], label [[OMP_REGION_END:%.*]]
+; CHECK:       omp_region.end:
+; CHECK-NEXT:    [[OMP_GLOBAL_THREAD_NUM1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* [[GLOB1]])
+; CHECK-NEXT:    call void @__kmpc_barrier(%struct.ident_t* [[GLOB2]], i32 [[OMP_GLOBAL_THREAD_NUM1]])
+; CHECK-NEXT:    br label [[OMP_PAR_MERGED_SPLIT_SPLIT:%.*]]
+; CHECK:       omp.par.merged.split.split:
+; CHECK-NEXT:    call void @.omp_outlined..21(i32* [[TID_ADDR]], i32* [[ZERO_ADDR]], i32* nocapture noundef nonnull readonly align 4 dereferenceable(4) [[CANCEL2_ADDR]])
+; CHECK-NEXT:    br label [[ENTRY_SPLIT:%.*]]
+; CHECK:       entry.split:
+; CHECK-NEXT:    br label [[OMP_PAR_REGION_SPLIT:%.*]]
+; CHECK:       omp.par.region.split:
+; CHECK-NEXT:    br label [[OMP_PAR_PRE_FINALIZE:%.*]]
+; CHECK:       omp.par.pre_finalize:
+; CHECK-NEXT:    br label [[OMP_PAR_OUTLINED_EXIT_EXITSTUB:%.*]]
+; CHECK:       omp_region.body:
+; CHECK-NEXT:    br label [[SEQ_PAR_MERGED:%.*]]
+; CHECK:       seq.par.merged:
+; CHECK-NEXT:    [[TMP3:%.*]] = load i32, i32* [[CANCEL1_ADDR]], align 4
+; CHECK-NEXT:    [[TOBOOL_NOT:%.*]] = icmp eq i32 [[TMP3]], 0
+; CHECK-NEXT:    [[LNOT_EXT:%.*]] = zext i1 [[TOBOOL_NOT]] to i32
+; CHECK-NEXT:    store i32 [[LNOT_EXT]], i32* [[CANCEL2_ADDR]], align 4
+; CHECK-NEXT:    br label [[OMP_PAR_MERGED_SPLIT:%.*]]
+; CHECK:       omp.par.merged.split:
+; CHECK-NEXT:    br label [[OMP_REGION_BODY_SPLIT:%.*]]
+; CHECK:       omp_region.body.split:
+; CHECK-NEXT:    call void @__kmpc_end_master(%struct.ident_t* [[GLOB1]], i32 [[OMP_GLOBAL_THREAD_NUM]])
+; CHECK-NEXT:    br label [[OMP_REGION_END]]
+;
+;
+; CHECK-LABEL: define {{[^@]+}}@.omp_outlined..20
+; CHECK-SAME: (i32* noalias nocapture readonly [[DOTGLOBAL_TID_:%.*]], i32* noalias nocapture nofree readnone [[DOTBOUND_TID_:%.*]], i32* nocapture noundef nonnull readonly align 4 dereferenceable(4) [[CANCEL1:%.*]]) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP0:%.*]] = load i32, i32* [[CANCEL1]], align 4
+; CHECK-NEXT:    [[TOBOOL_NOT:%.*]] = icmp eq i32 [[TMP0]], 0
+; CHECK-NEXT:    br i1 [[TOBOOL_NOT]], label [[IF_END:%.*]], label [[IF_THEN:%.*]]
+; CHECK:       if.then:
+; CHECK-NEXT:    [[TMP1:%.*]] = load i32, i32* [[DOTGLOBAL_TID_]], align 4
+; CHECK-NEXT:    [[TMP2:%.*]] = call i32 @__kmpc_cancel(%struct.ident_t* noundef nonnull [[GLOB1]], i32 [[TMP1]], i32 noundef 1)
+; CHECK-NEXT:    ret void
+; CHECK:       if.end:
+; CHECK-NEXT:    ret void
+;
+;
+; CHECK-LABEL: define {{[^@]+}}@.omp_outlined..21
+; CHECK-SAME: (i32* noalias nocapture readonly [[DOTGLOBAL_TID_:%.*]], i32* noalias nocapture nofree readnone [[DOTBOUND_TID_:%.*]], i32* nocapture noundef nonnull readonly align 4 dereferenceable(4) [[CANCEL2:%.*]]) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP0:%.*]] = load i32, i32* [[CANCEL2]], align 4
+; CHECK-NEXT:    [[TOBOOL_NOT:%.*]] = icmp eq i32 [[TMP0]], 0
+; CHECK-NEXT:    br i1 [[TOBOOL_NOT]], label [[IF_END:%.*]], label [[IF_THEN:%.*]]
+; CHECK:       if.then:
+; CHECK-NEXT:    [[TMP1:%.*]] = load i32, i32* [[DOTGLOBAL_TID_]], align 4
+; CHECK-NEXT:    [[TMP2:%.*]] = call i32 @__kmpc_cancel(%struct.ident_t* noundef nonnull [[GLOB1]], i32 [[TMP1]], i32 noundef 1)
+; CHECK-NEXT:    ret void
+; CHECK:       if.end:
+; CHECK-NEXT:    ret void
+;
+;
+; CHECK-LABEL: define {{[^@]+}}@merge_3
+; CHECK-SAME: (i32 [[A:%.*]]) local_unnamed_addr {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[A_ADDR:%.*]] = alloca i32, align 4
+; CHECK-NEXT:    store i32 [[A]], i32* [[A_ADDR]], align 4
+; CHECK-NEXT:    [[OMP_GLOBAL_THREAD_NUM:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* [[GLOB1]])
+; CHECK-NEXT:    br label [[OMP_PARALLEL:%.*]]
+; CHECK:       omp_parallel:
+; CHECK-NEXT:    call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* [[GLOB1]], i32 1, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, i32*)* @merge_3..omp_par to void (i32*, i32*, ...)*), i32* [[A_ADDR]])
+; CHECK-NEXT:    br label [[OMP_PAR_OUTLINED_EXIT:%.*]]
+; CHECK:       omp.par.outlined.exit:
+; CHECK-NEXT:    br label [[OMP_PAR_EXIT_SPLIT:%.*]]
+; CHECK:       omp.par.exit.split:
+; CHECK-NEXT:    br label [[ENTRY_SPLIT_SPLIT:%.*]]
+; CHECK:       entry.split.split:
+; CHECK-NEXT:    ret void
+;
+;
+; CHECK-LABEL: define {{[^@]+}}@merge_3..omp_par
+; CHECK-SAME: (i32* noalias [[TID_ADDR:%.*]], i32* noalias [[ZERO_ADDR:%.*]], i32* [[A_ADDR:%.*]]) [[ATTR0]] {
+; CHECK-NEXT:  omp.par.entry:
+; CHECK-NEXT:    [[TID_ADDR_LOCAL:%.*]] = alloca i32, align 4
+; CHECK-NEXT:    [[TMP0:%.*]] = load i32, i32* [[TID_ADDR]], align 4
+; CHECK-NEXT:    store i32 [[TMP0]], i32* [[TID_ADDR_LOCAL]], align 4
+; CHECK-NEXT:    [[TID:%.*]] = load i32, i32* [[TID_ADDR_LOCAL]], align 4
+; CHECK-NEXT:    br label [[OMP_PAR_REGION:%.*]]
+; CHECK:       omp.par.outlined.exit.exitStub:
+; CHECK-NEXT:    ret void
+; CHECK:       omp.par.region:
+; CHECK-NEXT:    br label [[OMP_PAR_MERGED:%.*]]
+; CHECK:       omp.par.merged:
+; CHECK-NEXT:    call void @.omp_outlined..22(i32* [[TID_ADDR]], i32* [[ZERO_ADDR]], i32* nocapture noundef nonnull readonly align 4 dereferenceable(4) [[A_ADDR]])
+; CHECK-NEXT:    [[OMP_GLOBAL_THREAD_NUM:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* [[GLOB1]])
+; CHECK-NEXT:    call void @__kmpc_barrier(%struct.ident_t* [[GLOB2]], i32 [[OMP_GLOBAL_THREAD_NUM]])
+; CHECK-NEXT:    call void @.omp_outlined..23(i32* [[TID_ADDR]], i32* [[ZERO_ADDR]], i32* nocapture noundef nonnull readonly align 4 dereferenceable(4) [[A_ADDR]])
+; CHECK-NEXT:    [[OMP_GLOBAL_THREAD_NUM1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* [[GLOB1]])
+; CHECK-NEXT:    call void @__kmpc_barrier(%struct.ident_t* [[GLOB2]], i32 [[OMP_GLOBAL_THREAD_NUM1]])
+; CHECK-NEXT:    call void @.omp_outlined..24(i32* [[TID_ADDR]], i32* [[ZERO_ADDR]], i32* nocapture noundef nonnull readonly align 4 dereferenceable(4) [[A_ADDR]])
+; CHECK-NEXT:    br label [[ENTRY_SPLIT:%.*]]
+; CHECK:       entry.split:
+; CHECK-NEXT:    br label [[OMP_PAR_REGION_SPLIT:%.*]]
+; CHECK:       omp.par.region.split:
+; CHECK-NEXT:    br label [[OMP_PAR_PRE_FINALIZE:%.*]]
+; CHECK:       omp.par.pre_finalize:
+; CHECK-NEXT:    br label [[OMP_PAR_OUTLINED_EXIT_EXITSTUB:%.*]]
+;
+;
+; CHECK-LABEL: define {{[^@]+}}@.omp_outlined..22
+; CHECK-SAME: (i32* noalias nocapture nofree readnone [[DOTGLOBAL_TID_:%.*]], i32* noalias nocapture nofree readnone [[DOTBOUND_TID_:%.*]], i32* nocapture noundef nonnull readonly align 4 dereferenceable(4) [[A:%.*]]) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP0:%.*]] = load i32, i32* [[A]], align 4
+; CHECK-NEXT:    call void @use(i32 [[TMP0]])
+; CHECK-NEXT:    ret void
+;
+;
+; CHECK-LABEL: define {{[^@]+}}@.omp_outlined..23
+; CHECK-SAME: (i32* noalias nocapture nofree readnone [[DOTGLOBAL_TID_:%.*]], i32* noalias nocapture nofree readnone [[DOTBOUND_TID_:%.*]], i32* nocapture noundef nonnull readonly align 4 dereferenceable(4) [[A:%.*]]) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP0:%.*]] = load i32, i32* [[A]], align 4
+; CHECK-NEXT:    call void @use(i32 [[TMP0]])
+; CHECK-NEXT:    ret void
+;
+;
+; CHECK-LABEL: define {{[^@]+}}@.omp_outlined..24
+; CHECK-SAME: (i32* noalias nocapture nofree readnone [[DOTGLOBAL_TID_:%.*]], i32* noalias nocapture nofree readnone [[DOTBOUND_TID_:%.*]], i32* nocapture noundef nonnull readonly align 4 dereferenceable(4) [[A:%.*]]) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP0:%.*]] = load i32, i32* [[A]], align 4
+; CHECK-NEXT:    call void @use(i32 [[TMP0]])
+; CHECK-NEXT:    ret void
+;
+;
+; CHECK-LABEL: define {{[^@]+}}@merge_3_seq
+; CHECK-SAME: (i32 [[A:%.*]], i32 [[B:%.*]]) local_unnamed_addr {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[ADD1_SEQ_OUTPUT_ALLOC:%.*]] = alloca i32, align 4
+; CHECK-NEXT:    [[ADD_SEQ_OUTPUT_ALLOC:%.*]] = alloca i32, align 4
+; CHECK-NEXT:    [[A_ADDR:%.*]] = alloca i32, align 4
+; CHECK-NEXT:    store i32 [[A]], i32* [[A_ADDR]], align 4
+; CHECK-NEXT:    [[OMP_GLOBAL_THREAD_NUM7:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* [[GLOB1]])
+; CHECK-NEXT:    br label [[OMP_PARALLEL:%.*]]
+; CHECK:       omp_parallel:
+; CHECK-NEXT:    call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* [[GLOB1]], i32 3, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, i32*, i32*, i32*)* @merge_3_seq..omp_par to void (i32*, i32*, ...)*), i32* [[A_ADDR]], i32* [[ADD_SEQ_OUTPUT_ALLOC]], i32* [[ADD1_SEQ_OUTPUT_ALLOC]])
+; CHECK-NEXT:    br label [[OMP_PAR_OUTLINED_EXIT:%.*]]
+; CHECK:       omp.par.outlined.exit:
+; CHECK-NEXT:    br label [[OMP_PAR_EXIT_SPLIT:%.*]]
+; CHECK:       omp.par.exit.split:
+; CHECK-NEXT:    br label [[ENTRY_SPLIT_SPLIT:%.*]]
+; CHECK:       entry.split.split:
+; CHECK-NEXT:    [[ADD1_SEQ_OUTPUT_LOAD:%.*]] = load i32, i32* [[ADD1_SEQ_OUTPUT_ALLOC]], align 4
+; CHECK-NEXT:    call void @use(i32 [[ADD1_SEQ_OUTPUT_LOAD]])
+; CHECK-NEXT:    ret void
+;
+;
+; CHECK-LABEL: define {{[^@]+}}@merge_3_seq..omp_par
+; CHECK-SAME: (i32* noalias [[TID_ADDR:%.*]], i32* noalias [[ZERO_ADDR:%.*]], i32* [[A_ADDR:%.*]], i32* [[ADD_SEQ_OUTPUT_ALLOC:%.*]], i32* [[ADD1_SEQ_OUTPUT_ALLOC:%.*]]) [[ATTR0]] {
+; CHECK-NEXT:  omp.par.entry:
+; CHECK-NEXT:    [[TID_ADDR_LOCAL:%.*]] = alloca i32, align 4
+; CHECK-NEXT:    [[TMP0:%.*]] = load i32, i32* [[TID_ADDR]], align 4
+; CHECK-NEXT:    store i32 [[TMP0]], i32* [[TID_ADDR_LOCAL]], align 4
+; CHECK-NEXT:    [[TID:%.*]] = load i32, i32* [[TID_ADDR_LOCAL]], align 4
+; CHECK-NEXT:    br label [[OMP_PAR_REGION:%.*]]
+; CHECK:       omp.par.outlined.exit.exitStub:
+; CHECK-NEXT:    ret void
+; CHECK:       omp.par.region:
+; CHECK-NEXT:    br label [[OMP_PAR_MERGED:%.*]]
+; CHECK:       omp.par.merged:
+; CHECK-NEXT:    call void @.omp_outlined..25(i32* [[TID_ADDR]], i32* [[ZERO_ADDR]], i32* nocapture noundef nonnull readonly align 4 dereferenceable(4) [[A_ADDR]])
+; CHECK-NEXT:    [[OMP_GLOBAL_THREAD_NUM2:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* [[GLOB1]])
+; CHECK-NEXT:    call void @__kmpc_barrier(%struct.ident_t* [[GLOB2]], i32 [[OMP_GLOBAL_THREAD_NUM2]])
+; CHECK-NEXT:    [[OMP_GLOBAL_THREAD_NUM:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* [[GLOB1]])
+; CHECK-NEXT:    [[TMP1:%.*]] = call i32 @__kmpc_master(%struct.ident_t* [[GLOB1]], i32 [[OMP_GLOBAL_THREAD_NUM]])
+; CHECK-NEXT:    [[TMP2:%.*]] = icmp ne i32 [[TMP1]], 0
+; CHECK-NEXT:    br i1 [[TMP2]], label [[OMP_REGION_BODY:%.*]], label [[OMP_REGION_END:%.*]]
+; CHECK:       omp_region.end:
+; CHECK-NEXT:    [[OMP_GLOBAL_THREAD_NUM1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* [[GLOB1]])
+; CHECK-NEXT:    call void @__kmpc_barrier(%struct.ident_t* [[GLOB2]], i32 [[OMP_GLOBAL_THREAD_NUM1]])
+; CHECK-NEXT:    br label [[OMP_PAR_MERGED_SPLIT_SPLIT:%.*]]
+; CHECK:       omp.par.merged.split.split:
+; CHECK-NEXT:    call void @.omp_outlined..26(i32* [[TID_ADDR]], i32* [[ZERO_ADDR]], i32* nocapture noundef nonnull readonly align 4 dereferenceable(4) [[A_ADDR]])
+; CHECK-NEXT:    [[OMP_GLOBAL_THREAD_NUM4:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* [[GLOB1]])
+; CHECK-NEXT:    call void @__kmpc_barrier(%struct.ident_t* [[GLOB2]], i32 [[OMP_GLOBAL_THREAD_NUM4]])
+; CHECK-NEXT:    [[OMP_GLOBAL_THREAD_NUM3:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* [[GLOB1]])
+; CHECK-NEXT:    [[TMP3:%.*]] = call i32 @__kmpc_master(%struct.ident_t* [[GLOB1]], i32 [[OMP_GLOBAL_THREAD_NUM3]])
+; CHECK-NEXT:    [[TMP4:%.*]] = icmp ne i32 [[TMP3]], 0
+; CHECK-NEXT:    br i1 [[TMP4]], label [[OMP_REGION_BODY5:%.*]], label [[OMP_REGION_END4:%.*]]
+; CHECK:       omp_region.end4:
+; CHECK-NEXT:    [[OMP_GLOBAL_THREAD_NUM6:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* [[GLOB1]])
+; CHECK-NEXT:    call void @__kmpc_barrier(%struct.ident_t* [[GLOB2]], i32 [[OMP_GLOBAL_THREAD_NUM6]])
+; CHECK-NEXT:    br label [[OMP_PAR_MERGED_SPLIT_SPLIT_SPLIT_SPLIT:%.*]]
+; CHECK:       omp.par.merged.split.split.split.split:
+; CHECK-NEXT:    call void @.omp_outlined..27(i32* [[TID_ADDR]], i32* [[ZERO_ADDR]], i32* nocapture noundef nonnull readonly align 4 dereferenceable(4) [[A_ADDR]])
+; CHECK-NEXT:    br label [[ENTRY_SPLIT:%.*]]
+; CHECK:       entry.split:
+; CHECK-NEXT:    br label [[OMP_PAR_REGION_SPLIT:%.*]]
+; CHECK:       omp.par.region.split:
+; CHECK-NEXT:    br label [[OMP_PAR_PRE_FINALIZE:%.*]]
+; CHECK:       omp.par.pre_finalize:
+; CHECK-NEXT:    br label [[OMP_PAR_OUTLINED_EXIT_EXITSTUB:%.*]]
+; CHECK:       omp_region.body5:
+; CHECK-NEXT:    br label [[SEQ_PAR_MERGED2:%.*]]
+; CHECK:       seq.par.merged2:
+; CHECK-NEXT:    [[TMP5:%.*]] = load i32, i32* [[A_ADDR]], align 4
+; CHECK-NEXT:    [[ADD_SEQ_OUTPUT_LOAD:%.*]] = load i32, i32* [[ADD_SEQ_OUTPUT_ALLOC]], align 4
+; CHECK-NEXT:    [[ADD1:%.*]] = add nsw i32 [[ADD_SEQ_OUTPUT_LOAD]], [[TMP5]]
+; CHECK-NEXT:    store i32 [[ADD1]], i32* [[ADD1_SEQ_OUTPUT_ALLOC]], align 4
+; CHECK-NEXT:    br label [[OMP_PAR_MERGED_SPLIT_SPLIT_SPLIT:%.*]]
+; CHECK:       omp.par.merged.split.split.split:
+; CHECK-NEXT:    br label [[OMP_REGION_BODY5_SPLIT:%.*]]
+; CHECK:       omp_region.body5.split:
+; CHECK-NEXT:    call void @__kmpc_end_master(%struct.ident_t* [[GLOB1]], i32 [[OMP_GLOBAL_THREAD_NUM3]])
+; CHECK-NEXT:    br label [[OMP_REGION_END4]]
+; CHECK:       omp_region.body:
+; CHECK-NEXT:    br label [[SEQ_PAR_MERGED:%.*]]
+; CHECK:       seq.par.merged:
+; CHECK-NEXT:    [[TMP6:%.*]] = load i32, i32* [[A_ADDR]], align 4
+; CHECK-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP6]], 1
+; CHECK-NEXT:    store i32 [[ADD]], i32* [[ADD_SEQ_OUTPUT_ALLOC]], align 4
+; CHECK-NEXT:    br label [[OMP_PAR_MERGED_SPLIT:%.*]]
+; CHECK:       omp.par.merged.split:
+; CHECK-NEXT:    br label [[OMP_REGION_BODY_SPLIT:%.*]]
+; CHECK:       omp_region.body.split:
+; CHECK-NEXT:    call void @__kmpc_end_master(%struct.ident_t* [[GLOB1]], i32 [[OMP_GLOBAL_THREAD_NUM]])
+; CHECK-NEXT:    br label [[OMP_REGION_END]]
+;
+;
+; CHECK-LABEL: define {{[^@]+}}@.omp_outlined..25
+; CHECK-SAME: (i32* noalias nocapture nofree readnone [[DOTGLOBAL_TID_:%.*]], i32* noalias nocapture nofree readnone [[DOTBOUND_TID_:%.*]], i32* nocapture noundef nonnull readonly align 4 dereferenceable(4) [[A:%.*]]) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP0:%.*]] = load i32, i32* [[A]], align 4
+; CHECK-NEXT:    call void @use(i32 [[TMP0]])
+; CHECK-NEXT:    ret void
+;
+;
+; CHECK-LABEL: define {{[^@]+}}@.omp_outlined..26
+; CHECK-SAME: (i32* noalias nocapture nofree readnone [[DOTGLOBAL_TID_:%.*]], i32* noalias nocapture nofree readnone [[DOTBOUND_TID_:%.*]], i32* nocapture noundef nonnull readonly align 4 dereferenceable(4) [[A:%.*]]) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP0:%.*]] = load i32, i32* [[A]], align 4
+; CHECK-NEXT:    call void @use(i32 [[TMP0]])
+; CHECK-NEXT:    ret void
+;
+;
+; CHECK-LABEL: define {{[^@]+}}@.omp_outlined..27
+; CHECK-SAME: (i32* noalias nocapture nofree readnone [[DOTGLOBAL_TID_:%.*]], i32* noalias nocapture nofree readnone [[DOTBOUND_TID_:%.*]], i32* nocapture noundef nonnull readonly align 4 dereferenceable(4) [[A:%.*]]) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP0:%.*]] = load i32, i32* [[A]], align 4
+; CHECK-NEXT:    call void @use(i32 [[TMP0]])
+; CHECK-NEXT:    ret void
+;
+;
+; CHECK-LABEL: define {{[^@]+}}@unmergable_3_seq_call
+; CHECK-SAME: (i32 [[A:%.*]]) local_unnamed_addr {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[A_ADDR:%.*]] = alloca i32, align 4
+; CHECK-NEXT:    store i32 [[A]], i32* [[A_ADDR]], align 4
+; CHECK-NEXT:    call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* noundef nonnull align 8 dereferenceable(24) [[GLOB1]], i32 noundef 1, void (i32*, i32*, ...)* noundef bitcast (void (i32*, i32*, i32*)* @.omp_outlined..28 to void (i32*, i32*, ...)*), i32* nocapture noundef nonnull readonly align 4 dereferenceable(4) [[A_ADDR]])
+; CHECK-NEXT:    call void (...) @foo()
+; CHECK-NEXT:    call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* noundef nonnull align 8 dereferenceable(24) [[GLOB1]], i32 noundef 1, void (i32*, i32*, ...)* noundef bitcast (void (i32*, i32*, i32*)* @.omp_outlined..29 to void (i32*, i32*, ...)*), i32* nocapture noundef nonnull readonly align 4 dereferenceable(4) [[A_ADDR]])
+; CHECK-NEXT:    call void (...) @foo()
+; CHECK-NEXT:    call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* noundef nonnull align 8 dereferenceable(24) [[GLOB1]], i32 noundef 1, void (i32*, i32*, ...)* noundef bitcast (void (i32*, i32*, i32*)* @.omp_outlined..30 to void (i32*, i32*, ...)*), i32* nocapture noundef nonnull readonly align 4 dereferenceable(4) [[A_ADDR]])
+; CHECK-NEXT:    ret void
+;
+;
+; CHECK-LABEL: define {{[^@]+}}@.omp_outlined..28
+; CHECK-SAME: (i32* noalias nocapture nofree readnone [[DOTGLOBAL_TID_:%.*]], i32* noalias nocapture nofree readnone [[DOTBOUND_TID_:%.*]], i32* nocapture noundef nonnull readonly align 4 dereferenceable(4) [[A:%.*]]) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP0:%.*]] = load i32, i32* [[A]], align 4
+; CHECK-NEXT:    call void @use(i32 [[TMP0]])
+; CHECK-NEXT:    ret void
+;
+;
+; CHECK-LABEL: define {{[^@]+}}@.omp_outlined..29
+; CHECK-SAME: (i32* noalias nocapture nofree readnone [[DOTGLOBAL_TID_:%.*]], i32* noalias nocapture nofree readnone [[DOTBOUND_TID_:%.*]], i32* nocapture noundef nonnull readonly align 4 dereferenceable(4) [[A:%.*]]) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP0:%.*]] = load i32, i32* [[A]], align 4
+; CHECK-NEXT:    call void @use(i32 [[TMP0]])
+; CHECK-NEXT:    ret void
+;
+;
+; CHECK-LABEL: define {{[^@]+}}@.omp_outlined..30
+; CHECK-SAME: (i32* noalias nocapture nofree readnone [[DOTGLOBAL_TID_:%.*]], i32* noalias nocapture nofree readnone [[DOTBOUND_TID_:%.*]], i32* nocapture noundef nonnull readonly align 4 dereferenceable(4) [[A:%.*]]) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP0:%.*]] = load i32, i32* [[A]], align 4
+; CHECK-NEXT:    call void @use(i32 [[TMP0]])
+; CHECK-NEXT:    ret void
+;
+;
+; CHECK-LABEL: define {{[^@]+}}@unmergable_3_proc_bind
+; CHECK-SAME: (i32 [[A:%.*]]) local_unnamed_addr {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[A_ADDR:%.*]] = alloca i32, align 4
+; CHECK-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* noundef nonnull align 8 dereferenceable(24) [[GLOB1]])
+; CHECK-NEXT:    store i32 [[A]], i32* [[A_ADDR]], align 4
+; CHECK-NEXT:    call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* noundef nonnull align 8 dereferenceable(24) [[GLOB1]], i32 noundef 1, void (i32*, i32*, ...)* noundef bitcast (void (i32*, i32*, i32*)* @.omp_outlined..31 to void (i32*, i32*, ...)*), i32* nocapture noundef nonnull readonly align 4 dereferenceable(4) [[A_ADDR]])
+; CHECK-NEXT:    call void @__kmpc_push_proc_bind(%struct.ident_t* noundef nonnull align 8 dereferenceable(24) [[GLOB1]], i32 [[TMP0]], i32 noundef 3)
+; CHECK-NEXT:    call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* noundef nonnull align 8 dereferenceable(24) [[GLOB1]], i32 noundef 1, void (i32*, i32*, ...)* noundef bitcast (void (i32*, i32*, i32*)* @.omp_outlined..32 to void (i32*, i32*, ...)*), i32* nocapture noundef nonnull readonly align 4 dereferenceable(4) [[A_ADDR]])
+; CHECK-NEXT:    call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* noundef nonnull align 8 dereferenceable(24) [[GLOB1]], i32 noundef 1, void (i32*, i32*, ...)* noundef bitcast (void (i32*, i32*, i32*)* @.omp_outlined..33 to void (i32*, i32*, ...)*), i32* nocapture noundef nonnull readonly align 4 dereferenceable(4) [[A_ADDR]])
+; CHECK-NEXT:    ret void
+;
+;
+; CHECK-LABEL: define {{[^@]+}}@.omp_outlined..31
+; CHECK-SAME: (i32* noalias nocapture nofree readnone [[DOTGLOBAL_TID_:%.*]], i32* noalias nocapture nofree readnone [[DOTBOUND_TID_:%.*]], i32* nocapture noundef nonnull readonly align 4 dereferenceable(4) [[A:%.*]]) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP0:%.*]] = load i32, i32* [[A]], align 4
+; CHECK-NEXT:    call void @use(i32 [[TMP0]])
+; CHECK-NEXT:    ret void
+;
+;
+; CHECK-LABEL: define {{[^@]+}}@.omp_outlined..32
+; CHECK-SAME: (i32* noalias nocapture nofree readnone [[DOTGLOBAL_TID_:%.*]], i32* noalias nocapture nofree readnone [[DOTBOUND_TID_:%.*]], i32* nocapture noundef nonnull readonly align 4 dereferenceable(4) [[A:%.*]]) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP0:%.*]] = load i32, i32* [[A]], align 4
+; CHECK-NEXT:    call void @use(i32 [[TMP0]])
+; CHECK-NEXT:    ret void
+;
+;
+; CHECK-LABEL: define {{[^@]+}}@.omp_outlined..33
+; CHECK-SAME: (i32* noalias nocapture nofree readnone [[DOTGLOBAL_TID_:%.*]], i32* noalias nocapture nofree readnone [[DOTBOUND_TID_:%.*]], i32* nocapture noundef nonnull readonly align 4 dereferenceable(4) [[A:%.*]]) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP0:%.*]] = load i32, i32* [[A]], align 4
+; CHECK-NEXT:    call void @use(i32 [[TMP0]])
+; CHECK-NEXT:    ret void
+;
+;
+; CHECK-LABEL: define {{[^@]+}}@unmergable_3_num_threads
+; CHECK-SAME: (i32 [[A:%.*]]) local_unnamed_addr {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[A_ADDR:%.*]] = alloca i32, align 4
+; CHECK-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* noundef nonnull align 8 dereferenceable(24) [[GLOB1]])
+; CHECK-NEXT:    store i32 [[A]], i32* [[A_ADDR]], align 4
+; CHECK-NEXT:    call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* noundef nonnull align 8 dereferenceable(24) [[GLOB1]], i32 noundef 1, void (i32*, i32*, ...)* noundef bitcast (void (i32*, i32*, i32*)* @.omp_outlined..34 to void (i32*, i32*, ...)*), i32* nocapture noundef nonnull readonly align 4 dereferenceable(4) [[A_ADDR]])
+; CHECK-NEXT:    [[TMP1:%.*]] = load i32, i32* [[A_ADDR]], align 4
+; CHECK-NEXT:    call void @__kmpc_push_num_threads(%struct.ident_t* noundef nonnull align 8 dereferenceable(24) [[GLOB1]], i32 [[TMP0]], i32 [[TMP1]])
+; CHECK-NEXT:    call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* noundef nonnull align 8 dereferenceable(24) [[GLOB1]], i32 noundef 1, void (i32*, i32*, ...)* noundef bitcast (void (i32*, i32*, i32*)* @.omp_outlined..35 to void (i32*, i32*, ...)*), i32* nocapture noundef nonnull readonly align 4 dereferenceable(4) [[A_ADDR]])
+; CHECK-NEXT:    call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* noundef nonnull align 8 dereferenceable(24) [[GLOB1]], i32 noundef 1, void (i32*, i32*, ...)* noundef bitcast (void (i32*, i32*, i32*)* @.omp_outlined..36 to void (i32*, i32*, ...)*), i32* nocapture noundef nonnull readonly align 4 dereferenceable(4) [[A_ADDR]])
+; CHECK-NEXT:    ret void
+;
+;
+; CHECK-LABEL: define {{[^@]+}}@.omp_outlined..34
+; CHECK-SAME: (i32* noalias nocapture nofree readnone [[DOTGLOBAL_TID_:%.*]], i32* noalias nocapture nofree readnone [[DOTBOUND_TID_:%.*]], i32* nocapture noundef nonnull readonly align 4 dereferenceable(4) [[A:%.*]]) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP0:%.*]] = load i32, i32* [[A]], align 4
+; CHECK-NEXT:    call void @use(i32 [[TMP0]])
+; CHECK-NEXT:    ret void
+;
+;
+; CHECK-LABEL: define {{[^@]+}}@.omp_outlined..35
+; CHECK-SAME: (i32* noalias nocapture nofree readnone [[DOTGLOBAL_TID_:%.*]], i32* noalias nocapture nofree readnone [[DOTBOUND_TID_:%.*]], i32* nocapture noundef nonnull readonly align 4 dereferenceable(4) [[A:%.*]]) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP0:%.*]] = load i32, i32* [[A]], align 4
+; CHECK-NEXT:    call void @use(i32 [[TMP0]])
+; CHECK-NEXT:    ret void
+;
+;
+; CHECK-LABEL: define {{[^@]+}}@.omp_outlined..36
+; CHECK-SAME: (i32* noalias nocapture nofree readnone [[DOTGLOBAL_TID_:%.*]], i32* noalias nocapture nofree readnone [[DOTBOUND_TID_:%.*]], i32* nocapture noundef nonnull readonly align 4 dereferenceable(4) [[A:%.*]]) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP0:%.*]] = load i32, i32* [[A]], align 4
+; CHECK-NEXT:    call void @use(i32 [[TMP0]])
+; CHECK-NEXT:    ret void
+;
+;
+; CHECK-LABEL: define {{[^@]+}}@merge_2_unmergable_1
+; CHECK-SAME: (i32 [[A:%.*]]) local_unnamed_addr {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[A_ADDR:%.*]] = alloca i32, align 4
+; CHECK-NEXT:    store i32 [[A]], i32* [[A_ADDR]], align 4
+; CHECK-NEXT:    [[OMP_GLOBAL_THREAD_NUM:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* [[GLOB1]])
+; CHECK-NEXT:    br label [[OMP_PARALLEL:%.*]]
+; CHECK:       omp_parallel:
+; CHECK-NEXT:    call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* [[GLOB1]], i32 1, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, i32*)* @merge_2_unmergable_1..omp_par to void (i32*, i32*, ...)*), i32* [[A_ADDR]])
+; CHECK-NEXT:    br label [[OMP_PAR_OUTLINED_EXIT:%.*]]
+; CHECK:       omp.par.outlined.exit:
+; CHECK-NEXT:    br label [[OMP_PAR_EXIT_SPLIT:%.*]]
+; CHECK:       omp.par.exit.split:
+; CHECK-NEXT:    br label [[ENTRY_SPLIT_SPLIT:%.*]]
+; CHECK:       entry.split.split:
+; CHECK-NEXT:    call void (...) @foo()
+; CHECK-NEXT:    call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* noundef nonnull align 8 dereferenceable(24) [[GLOB1]], i32 noundef 1, void (i32*, i32*, ...)* noundef bitcast (void (i32*, i32*, i32*)* @.omp_outlined..39 to void (i32*, i32*, ...)*), i32* nocapture noundef nonnull readonly align 4 dereferenceable(4) [[A_ADDR]])
+; CHECK-NEXT:    ret void
+;
+;
+; CHECK-LABEL: define {{[^@]+}}@merge_2_unmergable_1..omp_par
+; CHECK-SAME: (i32* noalias [[TID_ADDR:%.*]], i32* noalias [[ZERO_ADDR:%.*]], i32* [[A_ADDR:%.*]]) [[ATTR0]] {
+; CHECK-NEXT:  omp.par.entry:
+; CHECK-NEXT:    [[TID_ADDR_LOCAL:%.*]] = alloca i32, align 4
+; CHECK-NEXT:    [[TMP0:%.*]] = load i32, i32* [[TID_ADDR]], align 4
+; CHECK-NEXT:    store i32 [[TMP0]], i32* [[TID_ADDR_LOCAL]], align 4
+; CHECK-NEXT:    [[TID:%.*]] = load i32, i32* [[TID_ADDR_LOCAL]], align 4
+; CHECK-NEXT:    br label [[OMP_PAR_REGION:%.*]]
+; CHECK:       omp.par.outlined.exit.exitStub:
+; CHECK-NEXT:    ret void
+; CHECK:       omp.par.region:
+; CHECK-NEXT:    br label [[OMP_PAR_MERGED:%.*]]
+; CHECK:       omp.par.merged:
+; CHECK-NEXT:    call void @.omp_outlined..37(i32* [[TID_ADDR]], i32* [[ZERO_ADDR]], i32* nocapture noundef nonnull readonly align 4 dereferenceable(4) [[A_ADDR]])
+; CHECK-NEXT:    [[OMP_GLOBAL_THREAD_NUM:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* [[GLOB1]])
+; CHECK-NEXT:    call void @__kmpc_barrier(%struct.ident_t* [[GLOB2]], i32 [[OMP_GLOBAL_THREAD_NUM]])
+; CHECK-NEXT:    call void @.omp_outlined..38(i32* [[TID_ADDR]], i32* [[ZERO_ADDR]], i32* nocapture noundef nonnull readonly align 4 dereferenceable(4) [[A_ADDR]])
+; CHECK-NEXT:    br label [[ENTRY_SPLIT:%.*]]
+; CHECK:       entry.split:
+; CHECK-NEXT:    br label [[OMP_PAR_REGION_SPLIT:%.*]]
+; CHECK:       omp.par.region.split:
+; CHECK-NEXT:    br label [[OMP_PAR_PRE_FINALIZE:%.*]]
+; CHECK:       omp.par.pre_finalize:
+; CHECK-NEXT:    br label [[OMP_PAR_OUTLINED_EXIT_EXITSTUB:%.*]]
+;
+;
+; CHECK-LABEL: define {{[^@]+}}@.omp_outlined..37
+; CHECK-SAME: (i32* noalias nocapture nofree readnone [[DOTGLOBAL_TID_:%.*]], i32* noalias nocapture nofree readnone [[DOTBOUND_TID_:%.*]], i32* nocapture noundef nonnull readonly align 4 dereferenceable(4) [[A:%.*]]) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP0:%.*]] = load i32, i32* [[A]], align 4
+; CHECK-NEXT:    call void @use(i32 [[TMP0]])
+; CHECK-NEXT:    ret void
+;
+;
+; CHECK-LABEL: define {{[^@]+}}@.omp_outlined..38
+; CHECK-SAME: (i32* noalias nocapture nofree readnone [[DOTGLOBAL_TID_:%.*]], i32* noalias nocapture nofree readnone [[DOTBOUND_TID_:%.*]], i32* nocapture noundef nonnull readonly align 4 dereferenceable(4) [[A:%.*]]) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP0:%.*]] = load i32, i32* [[A]], align 4
+; CHECK-NEXT:    call void @use(i32 [[TMP0]])
+; CHECK-NEXT:    ret void
+;
+;
+; CHECK-LABEL: define {{[^@]+}}@.omp_outlined..39
+; CHECK-SAME: (i32* noalias nocapture nofree readnone [[DOTGLOBAL_TID_:%.*]], i32* noalias nocapture nofree readnone [[DOTBOUND_TID_:%.*]], i32* nocapture noundef nonnull readonly align 4 dereferenceable(4) [[A:%.*]]) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP0:%.*]] = load i32, i32* [[A]], align 4
+; CHECK-NEXT:    call void @use(i32 [[TMP0]])
 ; CHECK-NEXT:    ret void
 ;
diff --git a/llvm/test/Transforms/OpenMP/parallel_region_merging_legacy_pm.ll b/llvm/test/Transforms/OpenMP/parallel_region_merging_legacy_pm.ll
deleted file mode 100644
--- a/llvm/test/Transforms/OpenMP/parallel_region_merging_legacy_pm.ll
+++ /dev/null
@@ -1,412 +0,0 @@
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --function-signature --include-generated-funcs
-; RUN: opt -S -attributor -openmpopt -openmp-opt-enable-merging  < %s | FileCheck %s
-
-target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"
-
-%struct.ident_t = type { i32, i32, i32, i32, i8* }
-
-@0 = private unnamed_addr constant [23 x i8] c";unknown;unknown;0;0;;\00", align 1
-@1 = private unnamed_addr constant %struct.ident_t { i32 0, i32 2, i32 0, i32 0, i8* getelementptr inbounds ([23 x i8], [23 x i8]* @0, i32 0, i32 0) }, align 8
-
-;   void merge_all() {
-;       int a = 1;
-;   #pragma omp parallel
-;       {
-;           a = 2;
-;       }
-;   #pragma omp parallel
-;       {
-;           a = 3;
-;       }
-;   }
-;
-; Merge all parallel regions.
-define dso_local void @merge_all() local_unnamed_addr  {
-  %1 = alloca i32, align 4
-  %2 = bitcast i32* %1 to i8*
-  store i32 1, i32* %1, align 4
-  %3 = call i32 @__kmpc_global_thread_num(%struct.ident_t* nonnull @1)
-  call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* nonnull @1, i32 1, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, i32*)* @merge_all..omp_par to void (i32*, i32*, ...)*), i32* nonnull %1)
-  %4 = call i32 @__kmpc_global_thread_num(%struct.ident_t* nonnull @1)
-  call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* nonnull @1, i32 1, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, i32*)* @merge_all..omp_par.1 to void (i32*, i32*, ...)*), i32* nonnull %1)
-  ret void
-}
-
-define internal void @merge_all..omp_par.1(i32* noalias nocapture readnone %0, i32* noalias nocapture readnone %1, i32* nocapture %2)  {
-  store i32 3, i32* %2, align 4
-  ret void
-}
-
-define internal void @merge_all..omp_par(i32* noalias nocapture readnone %0, i32* noalias nocapture readnone %1, i32* nocapture %2)  {
-  store i32 2, i32* %2, align 4
-  ret void
-}
-
-
-declare i32 @__kmpc_global_thread_num(%struct.ident_t*) local_unnamed_addr
-
-declare !callback !1 void @__kmpc_fork_call(%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) local_unnamed_addr
-
-;   void merge_none() {
-;       int a = 1;
-;   #pragma omp parallel
-;       {
-;           a = 2;
-;       }
-;       a = 3;
-;   #pragma omp parallel
-;       {
-;           a = 4;
-;       }
-;   }
-;
-; Does not merge parallel regions, in-between store
-; instruction is unsafe to execute in parallel.
-define dso_local void @merge_none() local_unnamed_addr  {
-  %1 = alloca i32, align 4
-  %2 = bitcast i32* %1 to i8*
-  store i32 1, i32* %1, align 4
-  %3 = call i32 @__kmpc_global_thread_num(%struct.ident_t* nonnull @1)
-  call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* nonnull @1, i32 1, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, i32*)* @merge_none..omp_par to void (i32*, i32*, ...)*), i32* nonnull %1)
-  store i32 3, i32* %1, align 4
-  %4 = call i32 @__kmpc_global_thread_num(%struct.ident_t* nonnull @1)
-  call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* nonnull @1, i32 1, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, i32*)* @merge_none..omp_par.2 to void (i32*, i32*, ...)*), i32* nonnull %1)
-  ret void
-}
-
-define internal void @merge_none..omp_par.2(i32* noalias nocapture readnone %0, i32* noalias nocapture readnone %1, i32* nocapture %2)  {
-  store i32 4, i32* %2, align 4
-  ret void
-}
-
-define internal void @merge_none..omp_par(i32* noalias nocapture readnone %0, i32* noalias nocapture readnone %1, i32* nocapture %2)  {
-  store i32 2, i32* %2, align 4
-  ret void
-}
-
-;   void merge_some() {
-;       int a = 1;
-;   #pragma omp parallel
-;       {
-;           a = 2;
-;       }
-;       a = 3;
-;   #pragma omp parallel
-;       {
-;           a = 4;
-;       }
-;   #pragma omp parallel
-;       {
-;           a = 5;
-;       }
-;   }
-;
-; Do not merge first parallel region, due to the
-; unsafe store, but merge the two next parallel
-; regions.
-define dso_local void @merge_some() local_unnamed_addr  {
-  %1 = alloca i32, align 4
-  %2 = bitcast i32* %1 to i8*
-  store i32 1, i32* %1, align 4
-  %3 = call i32 @__kmpc_global_thread_num(%struct.ident_t* nonnull @1)
-  call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* nonnull @1, i32 1, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, i32*)* @merge_some..omp_par to void (i32*, i32*, ...)*), i32* nonnull %1)
-  store i32 3, i32* %1, align 4
-  %4 = call i32 @__kmpc_global_thread_num(%struct.ident_t* nonnull @1)
-  call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* nonnull @1, i32 1, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, i32*)* @merge_some..omp_par.3 to void (i32*, i32*, ...)*), i32* nonnull %1)
-  %5 = call i32 @__kmpc_global_thread_num(%struct.ident_t* nonnull @1)
-  call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* nonnull @1, i32 1, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, i32*)* @merge_some..omp_par.4 to void (i32*, i32*, ...)*), i32* nonnull %1)
-  ret void
-}
-
-define internal void @merge_some..omp_par.4(i32* noalias nocapture readnone %0, i32* noalias nocapture readnone %1, i32* nocapture %2)  {
-  store i32 5, i32* %2, align 4
-  ret void
-}
-
-define internal void @merge_some..omp_par.3(i32* noalias nocapture readnone %0, i32* noalias nocapture readnone %1, i32* nocapture %2)  {
-  store i32 4, i32* %2, align 4
-  ret void
-}
-
-define internal void @merge_some..omp_par(i32* noalias nocapture readnone %0, i32* noalias nocapture readnone %1, i32* nocapture %2)  {
-  store i32 2, i32* %2, align 4
-  ret void
-}
-
-;   void merge_cancellable_regions(int cancel1, int cancel2)
-;   {
-;   #pragma omp parallel
-;       {
-;           if(cancel1) {
-;   #pragma omp cancel parallel
-;           }
-;       }
-;   
-;   #pragma omp parallel
-;       {
-;           if (cancel2) {
-;   #pragma omp cancel parallel
-;           }
-;       }
-;   }
-;
-; Merge correctly cancellable regions.
-define dso_local void @merge_cancellable_regions(i32 %0, i32 %1) local_unnamed_addr  {
-  %3 = alloca i32, align 4
-  %4 = alloca i32, align 4
-  store i32 %0, i32* %3, align 4
-  store i32 %1, i32* %4, align 4
-  %5 = call i32 @__kmpc_global_thread_num(%struct.ident_t* nonnull @1)
-  call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* nonnull @1, i32 1, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, i32*)* @merge_cancellable_regions..omp_par to void (i32*, i32*, ...)*), i32* nonnull %3)
-  %6 = call i32 @__kmpc_global_thread_num(%struct.ident_t* nonnull @1)
-  call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* nonnull @1, i32 1, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, i32*)* @merge_cancellable_regions..omp_par.5 to void (i32*, i32*, ...)*), i32* nonnull %4)
-  ret void
-}
-
-define internal void @merge_cancellable_regions..omp_par.5(i32* noalias nocapture readnone %0, i32* noalias nocapture readnone %1, i32* nocapture readonly %2)  {
-  %4 = load i32, i32* %2, align 4
-  %5 = icmp eq i32 %4, 0
-  br i1 %5, label %6, label %7
-
-6:                                                ; preds = %3
-  ret void
-
-7:                                                ; preds = %3
-  %8 = call i32 @__kmpc_global_thread_num(%struct.ident_t* nonnull @1)
-  %9 = call i32 @__kmpc_cancel(%struct.ident_t* nonnull @1, i32 %8, i32 1)
-  ret void
-}
-
-define internal void @merge_cancellable_regions..omp_par(i32* noalias nocapture readnone %0, i32* noalias nocapture readnone %1, i32* nocapture readonly %2)  {
-  %4 = load i32, i32* %2, align 4
-  %5 = icmp eq i32 %4, 0
-  br i1 %5, label %6, label %7
-
-6:                                                ; preds = %3
-  ret void
-
-7:                                                ; preds = %3
-  %8 = call i32 @__kmpc_global_thread_num(%struct.ident_t* nonnull @1)
-  %9 = call i32 @__kmpc_cancel(%struct.ident_t* nonnull @1, i32 %8, i32 1)
-  ret void
-}
-
-declare i32 @__kmpc_cancel(%struct.ident_t*, i32, i32) local_unnamed_addr
-
-
-!llvm.module.flags = !{!0}
-
-!0 = !{i32 1, !"wchar_size", i32 4}
-!1 = !{!2}
-!2 = !{i64 2, i64 -1, i64 -1, i1 true}
-; CHECK-LABEL: define {{[^@]+}}@merge_all() local_unnamed_addr {
-; CHECK-NEXT:    [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* noundef nonnull align 8 dereferenceable(24) [[GLOB1:@.*]])
-; CHECK-NEXT:    [[TMP2:%.*]] = alloca i32, align 4
-; CHECK-NEXT:    store i32 1, i32* [[TMP2]], align 4
-; CHECK-NEXT:    [[OMP_GLOBAL_THREAD_NUM:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* [[GLOB1]])
-; CHECK-NEXT:    br label [[OMP_PARALLEL:%.*]]
-; CHECK:       omp_parallel:
-; CHECK-NEXT:    call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* [[GLOB1]], i32 1, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, i32*)* @merge_all..omp_par.2 to void (i32*, i32*, ...)*), i32* [[TMP2]])
-; CHECK-NEXT:    br label [[OMP_PAR_OUTLINED_EXIT:%.*]]
-; CHECK:       omp.par.outlined.exit:
-; CHECK-NEXT:    br label [[OMP_PAR_EXIT_SPLIT:%.*]]
-; CHECK:       omp.par.exit.split:
-; CHECK-NEXT:    br label [[DOTSPLIT_SPLIT:%.*]]
-; CHECK:       .split.split:
-; CHECK-NEXT:    ret void
-;
-;
-; CHECK-LABEL: define {{[^@]+}}@merge_all..omp_par.2
-; CHECK-SAME: (i32* noalias [[TID_ADDR:%.*]], i32* noalias [[ZERO_ADDR:%.*]], i32* [[TMP0:%.*]]) [[ATTR0:#.*]] {
-; CHECK-NEXT:  omp.par.entry:
-; CHECK-NEXT:    [[TID_ADDR_LOCAL:%.*]] = alloca i32, align 4
-; CHECK-NEXT:    [[TMP1:%.*]] = load i32, i32* [[TID_ADDR]], align 4
-; CHECK-NEXT:    store i32 [[TMP1]], i32* [[TID_ADDR_LOCAL]], align 4
-; CHECK-NEXT:    [[TID:%.*]] = load i32, i32* [[TID_ADDR_LOCAL]], align 4
-; CHECK-NEXT:    br label [[OMP_PAR_REGION:%.*]]
-; CHECK:       omp.par.outlined.exit.exitStub:
-; CHECK-NEXT:    ret void
-; CHECK:       omp.par.region:
-; CHECK-NEXT:    br label [[OMP_PAR_MERGED:%.*]]
-; CHECK:       omp.par.merged:
-; CHECK-NEXT:    call void @merge_all..omp_par(i32* [[TID_ADDR]], i32* [[ZERO_ADDR]], i32* nocapture nofree noundef nonnull writeonly align 4 dereferenceable(4) [[TMP0]])
-; CHECK-NEXT:    [[OMP_GLOBAL_THREAD_NUM:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* [[GLOB1]])
-; CHECK-NEXT:    call void @__kmpc_barrier(%struct.ident_t* [[GLOB2:@.*]], i32 [[OMP_GLOBAL_THREAD_NUM]])
-; CHECK-NEXT:    call void @merge_all..omp_par.1(i32* [[TID_ADDR]], i32* [[ZERO_ADDR]], i32* nocapture nofree noundef nonnull writeonly align 4 dereferenceable(4) [[TMP0]])
-; CHECK-NEXT:    br label [[DOTSPLIT:%.*]]
-; CHECK:       .split:
-; CHECK-NEXT:    br label [[OMP_PAR_REGION_SPLIT:%.*]]
-; CHECK:       omp.par.region.split:
-; CHECK-NEXT:    br label [[OMP_PAR_PRE_FINALIZE:%.*]]
-; CHECK:       omp.par.pre_finalize:
-; CHECK-NEXT:    br label [[OMP_PAR_OUTLINED_EXIT_EXITSTUB:%.*]]
-;
-;
-; CHECK-LABEL: define {{[^@]+}}@merge_all..omp_par.1
-; CHECK-SAME: (i32* noalias nocapture nofree readnone [[TMP0:%.*]], i32* noalias nocapture nofree readnone [[TMP1:%.*]], i32* nocapture nofree noundef nonnull writeonly align 4 dereferenceable(4) [[TMP2:%.*]]) [[ATTR1:#.*]] {
-; CHECK-NEXT:    store i32 3, i32* [[TMP2]], align 4
-; CHECK-NEXT:    ret void
-;
-;
-; CHECK-LABEL: define {{[^@]+}}@merge_all..omp_par
-; CHECK-SAME: (i32* noalias nocapture nofree readnone [[TMP0:%.*]], i32* noalias nocapture nofree readnone [[TMP1:%.*]], i32* nocapture nofree noundef nonnull writeonly align 4 dereferenceable(4) [[TMP2:%.*]]) [[ATTR1]] {
-; CHECK-NEXT:    store i32 2, i32* [[TMP2]], align 4
-; CHECK-NEXT:    ret void
-;
-;
-; CHECK-LABEL: define {{[^@]+}}@merge_none() local_unnamed_addr {
-; CHECK-NEXT:    [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* noundef nonnull align 8 dereferenceable(24) [[GLOB1]])
-; CHECK-NEXT:    [[TMP2:%.*]] = alloca i32, align 4
-; CHECK-NEXT:    store i32 1, i32* [[TMP2]], align 4
-; CHECK-NEXT:    call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* noundef nonnull align 8 dereferenceable(24) [[GLOB1]], i32 noundef 1, void (i32*, i32*, ...)* noundef bitcast (void (i32*, i32*, i32*)* @merge_none..omp_par to void (i32*, i32*, ...)*), i32* nocapture nofree noundef nonnull writeonly align 4 dereferenceable(4) [[TMP2]])
-; CHECK-NEXT:    store i32 3, i32* [[TMP2]], align 4
-; CHECK-NEXT:    call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* noundef nonnull align 8 dereferenceable(24) [[GLOB1]], i32 noundef 1, void (i32*, i32*, ...)* noundef bitcast (void (i32*, i32*, i32*)* @merge_none..omp_par.2 to void (i32*, i32*, ...)*), i32* nocapture nofree noundef nonnull writeonly align 4 dereferenceable(4) [[TMP2]])
-; CHECK-NEXT:    ret void
-;
-;
-; CHECK-LABEL: define {{[^@]+}}@merge_none..omp_par.2
-; CHECK-SAME: (i32* noalias nocapture nofree readnone [[TMP0:%.*]], i32* noalias nocapture nofree readnone [[TMP1:%.*]], i32* nocapture nofree noundef nonnull writeonly align 4 dereferenceable(4) [[TMP2:%.*]]) [[ATTR1]] {
-; CHECK-NEXT:    store i32 4, i32* [[TMP2]], align 4
-; CHECK-NEXT:    ret void
-;
-;
-; CHECK-LABEL: define {{[^@]+}}@merge_none..omp_par
-; CHECK-SAME: (i32* noalias nocapture nofree readnone [[TMP0:%.*]], i32* noalias nocapture nofree readnone [[TMP1:%.*]], i32* nocapture nofree noundef nonnull writeonly align 4 dereferenceable(4) [[TMP2:%.*]]) [[ATTR1]] {
-; CHECK-NEXT:    store i32 2, i32* [[TMP2]], align 4
-; CHECK-NEXT:    ret void
-;
-;
-; CHECK-LABEL: define {{[^@]+}}@merge_some() local_unnamed_addr {
-; CHECK-NEXT:    [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* noundef nonnull align 8 dereferenceable(24) [[GLOB1]])
-; CHECK-NEXT:    [[TMP2:%.*]] = alloca i32, align 4
-; CHECK-NEXT:    store i32 1, i32* [[TMP2]], align 4
-; CHECK-NEXT:    call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* noundef nonnull align 8 dereferenceable(24) [[GLOB1]], i32 noundef 1, void (i32*, i32*, ...)* noundef bitcast (void (i32*, i32*, i32*)* @merge_some..omp_par to void (i32*, i32*, ...)*), i32* nocapture nofree noundef nonnull writeonly align 4 dereferenceable(4) [[TMP2]])
-; CHECK-NEXT:    store i32 3, i32* [[TMP2]], align 4
-; CHECK-NEXT:    [[OMP_GLOBAL_THREAD_NUM:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* [[GLOB1]])
-; CHECK-NEXT:    br label [[OMP_PARALLEL:%.*]]
-; CHECK:       omp_parallel:
-; CHECK-NEXT:    call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* [[GLOB1]], i32 1, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, i32*)* @merge_some..omp_par.5 to void (i32*, i32*, ...)*), i32* [[TMP2]])
-; CHECK-NEXT:    br label [[OMP_PAR_OUTLINED_EXIT:%.*]]
-; CHECK:       omp.par.outlined.exit:
-; CHECK-NEXT:    br label [[OMP_PAR_EXIT_SPLIT:%.*]]
-; CHECK:       omp.par.exit.split:
-; CHECK-NEXT:    br label [[DOTSPLIT_SPLIT:%.*]]
-; CHECK:       .split.split:
-; CHECK-NEXT:    ret void
-;
-;
-; CHECK-LABEL: define {{[^@]+}}@merge_some..omp_par.5
-; CHECK-SAME: (i32* noalias [[TID_ADDR:%.*]], i32* noalias [[ZERO_ADDR:%.*]], i32* [[TMP0:%.*]]) [[ATTR0]] {
-; CHECK-NEXT:  omp.par.entry:
-; CHECK-NEXT:    [[TID_ADDR_LOCAL:%.*]] = alloca i32, align 4
-; CHECK-NEXT:    [[TMP1:%.*]] = load i32, i32* [[TID_ADDR]], align 4
-; CHECK-NEXT:    store i32 [[TMP1]], i32* [[TID_ADDR_LOCAL]], align 4
-; CHECK-NEXT:    [[TID:%.*]] = load i32, i32* [[TID_ADDR_LOCAL]], align 4
-; CHECK-NEXT:    br label [[OMP_PAR_REGION:%.*]]
-; CHECK:       omp.par.outlined.exit.exitStub:
-; CHECK-NEXT:    ret void
-; CHECK:       omp.par.region:
-; CHECK-NEXT:    br label [[OMP_PAR_MERGED:%.*]]
-; CHECK:       omp.par.merged:
-; CHECK-NEXT:    call void @merge_some..omp_par.3(i32* [[TID_ADDR]], i32* [[ZERO_ADDR]], i32* nocapture nofree noundef nonnull writeonly align 4 dereferenceable(4) [[TMP0]])
-; CHECK-NEXT:    [[OMP_GLOBAL_THREAD_NUM:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* [[GLOB1]])
-; CHECK-NEXT:    call void @__kmpc_barrier(%struct.ident_t* [[GLOB2]], i32 [[OMP_GLOBAL_THREAD_NUM]])
-; CHECK-NEXT:    call void @merge_some..omp_par.4(i32* [[TID_ADDR]], i32* [[ZERO_ADDR]], i32* nocapture nofree noundef nonnull writeonly align 4 dereferenceable(4) [[TMP0]])
-; CHECK-NEXT:    br label [[DOTSPLIT:%.*]]
-; CHECK:       .split:
-; CHECK-NEXT:    br label [[OMP_PAR_REGION_SPLIT:%.*]]
-; CHECK:       omp.par.region.split:
-; CHECK-NEXT:    br label [[OMP_PAR_PRE_FINALIZE:%.*]]
-; CHECK:       omp.par.pre_finalize:
-; CHECK-NEXT:    br label [[OMP_PAR_OUTLINED_EXIT_EXITSTUB:%.*]]
-;
-;
-; CHECK-LABEL: define {{[^@]+}}@merge_some..omp_par.4
-; CHECK-SAME: (i32* noalias nocapture nofree readnone [[TMP0:%.*]], i32* noalias nocapture nofree readnone [[TMP1:%.*]], i32* nocapture nofree noundef nonnull writeonly align 4 dereferenceable(4) [[TMP2:%.*]]) [[ATTR1]] {
-; CHECK-NEXT:    store i32 5, i32* [[TMP2]], align 4
-; CHECK-NEXT:    ret void
-;
-;
-; CHECK-LABEL: define {{[^@]+}}@merge_some..omp_par.3
-; CHECK-SAME: (i32* noalias nocapture nofree readnone [[TMP0:%.*]], i32* noalias nocapture nofree readnone [[TMP1:%.*]], i32* nocapture nofree noundef nonnull writeonly align 4 dereferenceable(4) [[TMP2:%.*]]) [[ATTR1]] {
-; CHECK-NEXT:    store i32 4, i32* [[TMP2]], align 4
-; CHECK-NEXT:    ret void
-;
-;
-; CHECK-LABEL: define {{[^@]+}}@merge_some..omp_par
-; CHECK-SAME: (i32* noalias nocapture nofree readnone [[TMP0:%.*]], i32* noalias nocapture nofree readnone [[TMP1:%.*]], i32* nocapture nofree noundef nonnull writeonly align 4 dereferenceable(4) [[TMP2:%.*]]) [[ATTR1]] {
-; CHECK-NEXT:    store i32 2, i32* [[TMP2]], align 4
-; CHECK-NEXT:    ret void
-;
-;
-; CHECK-LABEL: define {{[^@]+}}@merge_cancellable_regions
-; CHECK-SAME: (i32 [[TMP0:%.*]], i32 [[TMP1:%.*]]) local_unnamed_addr {
-; CHECK-NEXT:    [[TMP3:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* noundef nonnull align 8 dereferenceable(24) [[GLOB1]])
-; CHECK-NEXT:    [[TMP4:%.*]] = alloca i32, align 4
-; CHECK-NEXT:    [[TMP5:%.*]] = alloca i32, align 4
-; CHECK-NEXT:    store i32 [[TMP0]], i32* [[TMP4]], align 4
-; CHECK-NEXT:    store i32 [[TMP1]], i32* [[TMP5]], align 4
-; CHECK-NEXT:    [[OMP_GLOBAL_THREAD_NUM:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* [[GLOB1]])
-; CHECK-NEXT:    br label [[OMP_PARALLEL:%.*]]
-; CHECK:       omp_parallel:
-; CHECK-NEXT:    call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* [[GLOB1]], i32 2, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, i32*, i32*)* @merge_cancellable_regions..omp_par.6 to void (i32*, i32*, ...)*), i32* [[TMP4]], i32* [[TMP5]])
-; CHECK-NEXT:    br label [[OMP_PAR_OUTLINED_EXIT:%.*]]
-; CHECK:       omp.par.outlined.exit:
-; CHECK-NEXT:    br label [[OMP_PAR_EXIT_SPLIT:%.*]]
-; CHECK:       omp.par.exit.split:
-; CHECK-NEXT:    br label [[DOTSPLIT_SPLIT:%.*]]
-; CHECK:       .split.split:
-; CHECK-NEXT:    ret void
-;
-;
-; CHECK-LABEL: define {{[^@]+}}@merge_cancellable_regions..omp_par.6
-; CHECK-SAME: (i32* noalias [[TID_ADDR:%.*]], i32* noalias [[ZERO_ADDR:%.*]], i32* [[TMP0:%.*]], i32* [[TMP1:%.*]]) [[ATTR0]] {
-; CHECK-NEXT:  omp.par.entry:
-; CHECK-NEXT:    [[TID_ADDR_LOCAL:%.*]] = alloca i32, align 4
-; CHECK-NEXT:    [[TMP2:%.*]] = load i32, i32* [[TID_ADDR]], align 4
-; CHECK-NEXT:    store i32 [[TMP2]], i32* [[TID_ADDR_LOCAL]], align 4
-; CHECK-NEXT:    [[TID:%.*]] = load i32, i32* [[TID_ADDR_LOCAL]], align 4
-; CHECK-NEXT:    br label [[OMP_PAR_REGION:%.*]]
-; CHECK:       omp.par.outlined.exit.exitStub:
-; CHECK-NEXT:    ret void
-; CHECK:       omp.par.region:
-; CHECK-NEXT:    br label [[OMP_PAR_MERGED:%.*]]
-; CHECK:       omp.par.merged:
-; CHECK-NEXT:    call void @merge_cancellable_regions..omp_par(i32* [[TID_ADDR]], i32* [[ZERO_ADDR]], i32* nocapture noundef nonnull readonly align 4 dereferenceable(4) [[TMP0]])
-; CHECK-NEXT:    [[OMP_GLOBAL_THREAD_NUM:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* [[GLOB1]])
-; CHECK-NEXT:    call void @__kmpc_barrier(%struct.ident_t* [[GLOB2]], i32 [[OMP_GLOBAL_THREAD_NUM]])
-; CHECK-NEXT:    call void @merge_cancellable_regions..omp_par.5(i32* [[TID_ADDR]], i32* [[ZERO_ADDR]], i32* nocapture noundef nonnull readonly align 4 dereferenceable(4) [[TMP1]])
-; CHECK-NEXT:    br label [[DOTSPLIT:%.*]]
-; CHECK:       .split:
-; CHECK-NEXT:    br label [[OMP_PAR_REGION_SPLIT:%.*]]
-; CHECK:       omp.par.region.split:
-; CHECK-NEXT:    br label [[OMP_PAR_PRE_FINALIZE:%.*]]
-; CHECK:       omp.par.pre_finalize:
-; CHECK-NEXT:    br label [[OMP_PAR_OUTLINED_EXIT_EXITSTUB:%.*]]
-;
-;
-; CHECK-LABEL: define {{[^@]+}}@merge_cancellable_regions..omp_par.5
-; CHECK-SAME: (i32* noalias nocapture nofree readnone [[TMP0:%.*]], i32* noalias nocapture nofree readnone [[TMP1:%.*]], i32* nocapture noundef nonnull readonly align 4 dereferenceable(4) [[TMP2:%.*]]) {
-; CHECK-NEXT:    [[TMP4:%.*]] = load i32, i32* [[TMP2]], align 4
-; CHECK-NEXT:    [[TMP5:%.*]] = icmp eq i32 [[TMP4]], 0
-; CHECK-NEXT:    br i1 [[TMP5]], label [[TMP6:%.*]], label [[TMP7:%.*]]
-; CHECK:       6:
-; CHECK-NEXT:    ret void
-; CHECK:       7:
-; CHECK-NEXT:    [[TMP8:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* noundef nonnull [[GLOB1]])
-; CHECK-NEXT:    [[TMP9:%.*]] = call i32 @__kmpc_cancel(%struct.ident_t* noundef nonnull [[GLOB1]], i32 [[TMP8]], i32 noundef 1)
-; CHECK-NEXT:    ret void
-;
-;
-; CHECK-LABEL: define {{[^@]+}}@merge_cancellable_regions..omp_par
-; CHECK-SAME: (i32* noalias nocapture nofree readnone [[TMP0:%.*]], i32* noalias nocapture nofree readnone [[TMP1:%.*]], i32* nocapture noundef nonnull readonly align 4 dereferenceable(4) [[TMP2:%.*]]) {
-; CHECK-NEXT:    [[TMP4:%.*]] = load i32, i32* [[TMP2]], align 4
-; CHECK-NEXT:    [[TMP5:%.*]] = icmp eq i32 [[TMP4]], 0
-; CHECK-NEXT:    br i1 [[TMP5]], label [[TMP6:%.*]], label [[TMP7:%.*]]
-; CHECK:       6:
-; CHECK-NEXT:    ret void
-; CHECK:       7:
-; CHECK-NEXT:    [[TMP8:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* noundef nonnull [[GLOB1]])
-; CHECK-NEXT:    [[TMP9:%.*]] = call i32 @__kmpc_cancel(%struct.ident_t* noundef nonnull [[GLOB1]], i32 [[TMP8]], i32 noundef 1)
-; CHECK-NEXT:    ret void
-;