diff --git a/llvm/include/llvm/Frontend/OpenMP/OMPConstants.h b/llvm/include/llvm/Frontend/OpenMP/OMPConstants.h --- a/llvm/include/llvm/Frontend/OpenMP/OMPConstants.h +++ b/llvm/include/llvm/Frontend/OpenMP/OMPConstants.h @@ -111,10 +111,12 @@ /// Todo: Update kmp.h to include this file, and remove the enums in kmp.h /// To complete this, more enum values will need to be moved here. enum class OMPScheduleType { - Static = 34, /**< static unspecialized */ + StaticChunked = 33, + Static = 34, + DistributeChunked = 91, + Distribute = 92, DynamicChunked = 35, - ModifierNonmonotonic = - (1 << 30), /**< Set if the nonmonotonic schedule modifier was present */ + ModifierNonmonotonic = (1 << 30), LLVM_MARK_AS_BITMASK_ENUM(/* LargestValue */ ModifierNonmonotonic) }; diff --git a/llvm/lib/Transforms/IPO/OpenMPOpt.cpp b/llvm/lib/Transforms/IPO/OpenMPOpt.cpp --- a/llvm/lib/Transforms/IPO/OpenMPOpt.cpp +++ b/llvm/lib/Transforms/IPO/OpenMPOpt.cpp @@ -68,6 +68,9 @@ "Number of OpenMP runtime function uses identified"); STATISTIC(NumOpenMPTargetRegionKernels, "Number of OpenMP target region entry points (=kernels) identified"); +STATISTIC(NumOpenMPTargetRegionKernelsSPMD, + "Number of OpenMP target region entry points (=kernels) executed in " + "SPMD-mode instead of generic-mode"); STATISTIC(NumOpenMPTargetRegionKernelsCustomStateMachine, "Number of OpenMP target region entry points (=kernels) executed in " "generic-mode with customized state machines"); @@ -456,6 +459,9 @@ /// in the ParallelRegions set above. bool MayReachUnknownParallelRegion = false; + /// Flag to indicate if the associated function can be executed in SPMD mode. + bool IsSPMDCompatible = true; + /// Abstract State interface ///{ @@ -474,6 +480,7 @@ /// See AbstractState::indicatePessimisticFixpoint(...) 
ChangeStatus indicatePessimisticFixpoint() override { IsAtFixpoint = true; + IsSPMDCompatible = false; MayReachUnknownParallelRegion = true; return ChangeStatus::CHANGED; } @@ -489,7 +496,8 @@ const KernelInfoState &getAssumed() const { return *this; } bool operator==(const KernelInfoState &RHS) const { - if ((MayReachUnknownParallelRegion != RHS.MayReachUnknownParallelRegion)) + if ((MayReachUnknownParallelRegion != RHS.MayReachUnknownParallelRegion) | + (IsSPMDCompatible != RHS.IsSPMDCompatible)) return false; return ParallelRegions.size() == RHS.ParallelRegions.size(); } @@ -511,6 +519,7 @@ if (KIS.KernelInitCB || KIS.KernelDeinitCB) indicatePessimisticFixpoint(); MayReachUnknownParallelRegion |= KIS.MayReachUnknownParallelRegion; + IsSPMDCompatible &= KIS.IsSPMDCompatible; ParallelRegions.insert(KIS.ParallelRegions.begin(), KIS.ParallelRegions.end()); return *this; @@ -548,6 +557,7 @@ const int InitIsSPMDArgNo = 1; const int InitUseStateMachineArgNo = 2; + const int DeinitIsSPMDArgNo = 1; // Check if the current configuration is non-SPMD and generic state machine. // If we already have SPMD mode or a custom state machine we do not need to @@ -558,6 +568,22 @@ ConstantInt *IsSPMD = dyn_cast(KernelInitCB->getArgOperand(InitIsSPMDArgNo)); + auto &Ctx = getAnchorValue().getContext(); + // First check if we can go to SPMD-mode, that is the best option. + if (canBeExecutedInSPMDMode() && IsSPMD && IsSPMD->isZero()) { + // Indicate we use SPMD mode now. 
+ A.changeUseAfterManifest(KernelInitCB->getArgOperandUse(InitIsSPMDArgNo), + *ConstantInt::getBool(Ctx, 1)); + A.changeUseAfterManifest( + KernelInitCB->getArgOperandUse(InitUseStateMachineArgNo), + *ConstantInt::getBool(Ctx, 0)); + A.changeUseAfterManifest( + KernelDeinitCB->getArgOperandUse(DeinitIsSPMDArgNo), + *ConstantInt::getBool(Ctx, 1)); + ++NumOpenMPTargetRegionKernelsSPMD; + return ChangeStatus::CHANGED; + } + // If we are stuck with generic mode, try to create a custom device (=GPU) // state machine which is specialized for the parallel regions that are // reachable by the kernel. @@ -566,8 +592,7 @@ return ChangeStatus::UNCHANGED; } - // First, indicate we use a custom state machine now. - auto &Ctx = getAnchorValue().getContext(); + // If not SPMD mode, indicate we use a custom state machine now. auto *FalseVal = ConstantInt::getBool(Ctx, 0); A.changeUseAfterManifest( KernelInitCB->getArgOperandUse(InitUseStateMachineArgNo), *FalseVal); @@ -737,9 +762,13 @@ /// Statistics are tracked as part of manifest for now. void trackStatistics() const override {} + /// Returns true if the associated function can be executed in SPMD mode. + bool canBeExecutedInSPMDMode() const { return IsSPMDCompatible; } + /// See AbstractAttribute::getAsStr() const std::string getAsStr() const override { - return std::string("#PR: ") + std::to_string(ParallelRegions.size()) + + return std::string(canBeExecutedInSPMDMode() ? "SPMD" : "generic") + + std::string(" | #PR: ") + std::to_string(ParallelRegions.size()) + (MayReachUnknownParallelRegion ? " + Unknown PR" : ""); } @@ -771,6 +800,29 @@ ChangeStatus updateImpl(Attributor &A) override { KernelInfoState StateBefore = getState(); + // Callback to check a read/write instruction. + auto CheckRWInst = [&](Instruction &I) { + // We handle calls later. + if (isa(I)) + return true; + // We only care about write effects. 
+ if (!I.mayWriteToMemory()) + return true; + if (auto *SI = dyn_cast(&I)) { + SmallVector Objects; + getUnderlyingObjects(SI->getPointerOperand(), Objects); + if (llvm::all_of(Objects, + [](const Value *Obj) { return isa(Obj); })) + return true; + } + // For now we give up on everything but stores. + IsSPMDCompatible = false; + return true; + }; + if (IsSPMDCompatible && + !A.checkForAllReadWriteInstructions(CheckRWInst, *this)) + IsSPMDCompatible = false; + // Callback to check a call instruction. auto CheckCallInst = [&](Instruction &I) { auto &CB = cast(I); @@ -807,6 +859,7 @@ } // The callee is not known, not ipo-amendable (e.g., due to linkage), or // we can for some other reason not analyze it -> worst case. + IsSPMDCompatible = false; MayReachUnknownParallelRegion = true; return true; } @@ -814,6 +867,15 @@ const unsigned int WrapperFunctionArgNo = 6; RuntimeFunction RF = It->getSecond(); switch (RF) { + // All the functions we know are compatible with SPMD mode. + case OMPRTL___kmpc_for_static_fini: + case OMPRTL___kmpc_global_thread_num: + case OMPRTL___kmpc_single: + case OMPRTL___kmpc_end_single: + case OMPRTL___kmpc_master: + case OMPRTL___kmpc_end_master: + case OMPRTL___kmpc_barrier: + return true; case OMPRTL___kmpc_target_init: // The only reason we completely give up is when we see a second init or // deinit call. @@ -839,15 +901,39 @@ // worst. MayReachUnknownParallelRegion = true; return true; + case OMPRTL___kmpc_for_static_init_4: + case OMPRTL___kmpc_for_static_init_4u: + case OMPRTL___kmpc_for_static_init_8: + case OMPRTL___kmpc_for_static_init_8u: { + // Check the schedule and allow static schedule in SPMD mode. + unsigned ScheduleArgOpNo = 2; + auto *ScheduleTypeCI = + dyn_cast(CB.getArgOperand(ScheduleArgOpNo)); + unsigned ScheduleTypeVal = + ScheduleTypeCI ? 
ScheduleTypeCI->getZExtValue() : 0; + switch (OMPScheduleType(ScheduleTypeVal)) { + case OMPScheduleType::Static: + case OMPScheduleType::StaticChunked: + case OMPScheduleType::Distribute: + case OMPScheduleType::DistributeChunked: + return true; + default: + break; + }; + IsSPMDCompatible = false; + return true; + } case OMPRTL___kmpc_omp_task: // We do not look into tasks right now, just give up. MayReachUnknownParallelRegion = true; + IsSPMDCompatible = false; return true; default: break; } - // All other OpenMP runtime calls will not reach parallel regions so they - // can be safely ignored for now. + // All other OpenMP runtime calls will not reach parallel regions but they + // might not be compatible with SPMD mode. + IsSPMDCompatible = false; return true; }; if (!A.checkForAllCallLikeInstructions(CheckCallInst, *this)) diff --git a/llvm/test/Transforms/OpenMP/custom_state_machines.ll b/llvm/test/Transforms/OpenMP/custom_state_machines.ll --- a/llvm/test/Transforms/OpenMP/custom_state_machines.ll +++ b/llvm/test/Transforms/OpenMP/custom_state_machines.ll @@ -850,56 +850,21 @@ ret void } -; The second to last argument of __kmpc_target_init is is set to false to indicate we do not need the generic runtime state machine. -; A user code state machine is build because we do need one. No fallback and only one pointer comparison is needed. +; The second argument of __kmpc_target_init and deinit is set to true to indicate that we can run in SPMD mode. 
define weak void @__omp_offloading_2c_389eb_simple_state_machine_no_openmp_attr_l61() #0 { ; CHECK-LABEL: @__omp_offloading_2c_389eb_simple_state_machine_no_openmp_attr_l61( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[WORKER_WORK_FN_ADDR:%.*]] = alloca i8*, align 8 ; CHECK-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 ; CHECK-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 ; CHECK-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 -; CHECK-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i1 false, i1 false, i1 true) #[[ATTR8]] -; CHECK-NEXT: [[THREAD_IS_WORKER:%.*]] = icmp ne i32 [[TMP0]], -1 -; CHECK-NEXT: br i1 [[THREAD_IS_WORKER]], label [[WORKER_STATE_MACHINE_BEGIN:%.*]], label [[THREAD_USER_CODE_CHECK:%.*]] -; CHECK: worker_state_machine.begin: -; CHECK-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* @[[GLOB1]], i32 [[TMP0]]) -; CHECK-NEXT: [[WORKER_IS_ACTIVE:%.*]] = call i1 @__kmpc_kernel_parallel(i8** [[WORKER_WORK_FN_ADDR]]) -; CHECK-NEXT: [[WORKER_WORK_FN:%.*]] = load i8*, i8** [[WORKER_WORK_FN_ADDR]], align 8 -; CHECK-NEXT: [[WORKER_WORK_FN_ADDR_CAST:%.*]] = bitcast i8* [[WORKER_WORK_FN]] to void (i16, i32)* -; CHECK-NEXT: [[WORKER_IS_DONE:%.*]] = icmp eq i8* [[WORKER_WORK_FN]], null -; CHECK-NEXT: br i1 [[WORKER_IS_DONE]], label [[WORKER_STATE_MACHINE_FINISHED:%.*]], label [[WORKER_STATE_MACHINE_IS_ACTIVE_CHECK:%.*]] -; CHECK: worker_state_machine.finished: -; CHECK-NEXT: ret void -; CHECK: worker_state_machine.is_active.check: -; CHECK-NEXT: br i1 [[WORKER_IS_ACTIVE]], label [[WORKER_STATE_MACHINE_PARALLEL_REGION_CHECK:%.*]], label [[WORKER_STATE_MACHINE_DONE_BARRIER:%.*]] -; CHECK: worker_state_machine.parallel_region.check: -; CHECK-NEXT: [[WORKER_CHECK_PARALLEL_REGION:%.*]] = icmp eq void (i16, i32)* [[WORKER_WORK_FN_ADDR_CAST]], @__omp_outlined__10_wrapper -; CHECK-NEXT: br i1 [[WORKER_CHECK_PARALLEL_REGION]], label [[WORKER_STATE_MACHINE_PARALLEL_REGION_EXECUTE:%.*]], label 
[[WORKER_STATE_MACHINE_PARALLEL_REGION_CHECK1:%.*]] -; CHECK: worker_state_machine.parallel_region.execute: -; CHECK-NEXT: call void @__omp_outlined__10_wrapper(i16 0, i32 [[TMP0]]) -; CHECK-NEXT: br label [[WORKER_STATE_MACHINE_PARALLEL_REGION_END:%.*]] -; CHECK: worker_state_machine.parallel_region.check1: -; CHECK-NEXT: br i1 true, label [[WORKER_STATE_MACHINE_PARALLEL_REGION_EXECUTE2:%.*]], label [[WORKER_STATE_MACHINE_PARALLEL_REGION_CHECK3:%.*]] -; CHECK: worker_state_machine.parallel_region.execute2: -; CHECK-NEXT: call void @__omp_outlined__11_wrapper(i16 0, i32 [[TMP0]]) -; CHECK-NEXT: br label [[WORKER_STATE_MACHINE_PARALLEL_REGION_END]] -; CHECK: worker_state_machine.parallel_region.check3: -; CHECK-NEXT: br label [[WORKER_STATE_MACHINE_PARALLEL_REGION_END]] -; CHECK: worker_state_machine.parallel_region.end: -; CHECK-NEXT: call void @__kmpc_kernel_end_parallel() -; CHECK-NEXT: br label [[WORKER_STATE_MACHINE_DONE_BARRIER]] -; CHECK: worker_state_machine.done.barrier: -; CHECK-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* @[[GLOB1]], i32 [[TMP0]]) -; CHECK-NEXT: br label [[WORKER_STATE_MACHINE_BEGIN]] -; CHECK: thread.user_code.check: +; CHECK-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i1 true, i1 false, i1 true) #[[ATTR8]] ; CHECK-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1 ; CHECK-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]] ; CHECK: user_code.entry: ; CHECK-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) #[[ATTR9]] ; CHECK-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4 ; CHECK-NEXT: call void @__omp_outlined__9(i32* noundef nonnull align 4 dereferenceable(4) [[DOTTHREADID_TEMP_]], i32* noundef nonnull align 4 dereferenceable(4) [[DOTZERO_ADDR]]) #[[ATTR9]] -; CHECK-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i1 false, i1 true) #[[ATTR8]] +; CHECK-NEXT: call void 
@__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i1 true, i1 true) #[[ATTR8]] ; CHECK-NEXT: ret void ; CHECK: worker.exit: ; CHECK-NEXT: ret void @@ -1058,56 +1023,21 @@ ret void } -; The second to last argument of __kmpc_target_init is is set to false to indicate we do not need the generic runtime state machine. -; A user code state machine is build because we do need one. No fallback and only one pointer comparison is needed. +; The second argument of __kmpc_target_init and deinit is set to true to indicate that we can run in SPMD mode. define weak void @__omp_offloading_2c_389eb_simple_state_machine_pure_l72() #0 { ; CHECK-LABEL: @__omp_offloading_2c_389eb_simple_state_machine_pure_l72( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[WORKER_WORK_FN_ADDR:%.*]] = alloca i8*, align 8 ; CHECK-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 ; CHECK-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 ; CHECK-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 -; CHECK-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i1 false, i1 false, i1 true) #[[ATTR8]] -; CHECK-NEXT: [[THREAD_IS_WORKER:%.*]] = icmp ne i32 [[TMP0]], -1 -; CHECK-NEXT: br i1 [[THREAD_IS_WORKER]], label [[WORKER_STATE_MACHINE_BEGIN:%.*]], label [[THREAD_USER_CODE_CHECK:%.*]] -; CHECK: worker_state_machine.begin: -; CHECK-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* @[[GLOB1]], i32 [[TMP0]]) -; CHECK-NEXT: [[WORKER_IS_ACTIVE:%.*]] = call i1 @__kmpc_kernel_parallel(i8** [[WORKER_WORK_FN_ADDR]]) -; CHECK-NEXT: [[WORKER_WORK_FN:%.*]] = load i8*, i8** [[WORKER_WORK_FN_ADDR]], align 8 -; CHECK-NEXT: [[WORKER_WORK_FN_ADDR_CAST:%.*]] = bitcast i8* [[WORKER_WORK_FN]] to void (i16, i32)* -; CHECK-NEXT: [[WORKER_IS_DONE:%.*]] = icmp eq i8* [[WORKER_WORK_FN]], null -; CHECK-NEXT: br i1 [[WORKER_IS_DONE]], label [[WORKER_STATE_MACHINE_FINISHED:%.*]], label [[WORKER_STATE_MACHINE_IS_ACTIVE_CHECK:%.*]] -; CHECK: worker_state_machine.finished: -; CHECK-NEXT: ret void -; CHECK: 
worker_state_machine.is_active.check: -; CHECK-NEXT: br i1 [[WORKER_IS_ACTIVE]], label [[WORKER_STATE_MACHINE_PARALLEL_REGION_CHECK:%.*]], label [[WORKER_STATE_MACHINE_DONE_BARRIER:%.*]] -; CHECK: worker_state_machine.parallel_region.check: -; CHECK-NEXT: [[WORKER_CHECK_PARALLEL_REGION:%.*]] = icmp eq void (i16, i32)* [[WORKER_WORK_FN_ADDR_CAST]], @__omp_outlined__13_wrapper -; CHECK-NEXT: br i1 [[WORKER_CHECK_PARALLEL_REGION]], label [[WORKER_STATE_MACHINE_PARALLEL_REGION_EXECUTE:%.*]], label [[WORKER_STATE_MACHINE_PARALLEL_REGION_CHECK1:%.*]] -; CHECK: worker_state_machine.parallel_region.execute: -; CHECK-NEXT: call void @__omp_outlined__13_wrapper(i16 0, i32 [[TMP0]]) -; CHECK-NEXT: br label [[WORKER_STATE_MACHINE_PARALLEL_REGION_END:%.*]] -; CHECK: worker_state_machine.parallel_region.check1: -; CHECK-NEXT: br i1 true, label [[WORKER_STATE_MACHINE_PARALLEL_REGION_EXECUTE2:%.*]], label [[WORKER_STATE_MACHINE_PARALLEL_REGION_CHECK3:%.*]] -; CHECK: worker_state_machine.parallel_region.execute2: -; CHECK-NEXT: call void @__omp_outlined__14_wrapper(i16 0, i32 [[TMP0]]) -; CHECK-NEXT: br label [[WORKER_STATE_MACHINE_PARALLEL_REGION_END]] -; CHECK: worker_state_machine.parallel_region.check3: -; CHECK-NEXT: br label [[WORKER_STATE_MACHINE_PARALLEL_REGION_END]] -; CHECK: worker_state_machine.parallel_region.end: -; CHECK-NEXT: call void @__kmpc_kernel_end_parallel() -; CHECK-NEXT: br label [[WORKER_STATE_MACHINE_DONE_BARRIER]] -; CHECK: worker_state_machine.done.barrier: -; CHECK-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* @[[GLOB1]], i32 [[TMP0]]) -; CHECK-NEXT: br label [[WORKER_STATE_MACHINE_BEGIN]] -; CHECK: thread.user_code.check: +; CHECK-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i1 true, i1 false, i1 true) #[[ATTR8]] ; CHECK-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1 ; CHECK-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]] ; CHECK: user_code.entry: ; 
CHECK-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) #[[ATTR9]] ; CHECK-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4 ; CHECK-NEXT: call void @__omp_outlined__12(i32* noundef nonnull align 4 dereferenceable(4) [[DOTTHREADID_TEMP_]], i32* noundef nonnull align 4 dereferenceable(4) [[DOTZERO_ADDR]]) #[[ATTR9]] -; CHECK-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i1 false, i1 true) #[[ATTR8]] +; CHECK-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i1 true, i1 true) #[[ATTR8]] ; CHECK-NEXT: ret void ; CHECK: worker.exit: ; CHECK-NEXT: ret void diff --git a/llvm/test/Transforms/OpenMP/spmd_mode.ll b/llvm/test/Transforms/OpenMP/spmd_mode.ll new file mode 100644 --- /dev/null +++ b/llvm/test/Transforms/OpenMP/spmd_mode.ll @@ -0,0 +1,180 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt -S -passes=openmp-opt < %s | FileCheck %s + +;; void unknown(void); +;; +;; void sequential_loop() { +;; #pragma omp target teams +;; { +;; for (int i = 0; i < 100; ++i) { +;; #pragma omp parallel +;; { +;; unknown(); +;; } +;; } +;; } +;; } + +target triple = "nvptx64" + +%struct.ident_t = type { i32, i32, i32, i32, i8* } + +@"_openmp_kernel_static_glob_rd$ptr" = internal addrspace(3) global i8* undef +@0 = private unnamed_addr constant [23 x i8] c";unknown;unknown;0;0;;\00", align 1 +@1 = private unnamed_addr constant %struct.ident_t { i32 0, i32 2, i32 0, i32 0, i8* getelementptr inbounds ([23 x i8], [23 x i8]* @0, i32 0, i32 0) }, align 8 +@__omp_offloading_2c_38c77_sequential_loop_l4_exec_mode = weak constant i8 1 +@llvm.compiler.used = appending global [1 x i8*] [i8* @__omp_offloading_2c_38c77_sequential_loop_l4_exec_mode], section "llvm.metadata" + +; The second argument of __kmpc_target_init and deinit is set to true to indicate that we can run in SPMD mode. 
+define weak void @__omp_offloading_2c_38c77_sequential_loop_l4() #0 { +; CHECK-LABEL: @__omp_offloading_2c_38c77_sequential_loop_l4( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 +; CHECK-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 +; CHECK-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 +; CHECK-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1:[0-9]+]], i1 true, i1 false, i1 true) #[[ATTR3:[0-9]+]] +; CHECK-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1 +; CHECK-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]] +; CHECK: user_code.entry: +; CHECK-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) #[[ATTR4:[0-9]+]] +; CHECK-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4 +; CHECK-NEXT: [[TMP2:%.*]] = load i32, i32* [[DOTTHREADID_TEMP_]], align 4 +; CHECK-NEXT: call void @__omp_outlined__(i32 [[TMP2]]) #[[ATTR4]] +; CHECK-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i1 true, i1 true) #[[ATTR3]] +; CHECK-NEXT: ret void +; CHECK: worker.exit: +; CHECK-NEXT: ret void +; +entry: + %.zero.addr = alloca i32, align 4 + %.threadid_temp. 
= alloca i32, align 4 + store i32 0, i32* %.zero.addr, align 4 + %0 = call i32 @__kmpc_target_init(%struct.ident_t* @1, i1 false, i1 true, i1 true) + %exec_user_code = icmp eq i32 %0, -1 + br i1 %exec_user_code, label %user_code.entry, label %worker.exit + +user_code.entry: ; preds = %entry + %1 = call i32 @__kmpc_global_thread_num(%struct.ident_t* @1) + store i32 %1, i32* %.threadid_temp., align 4 + call void @__omp_outlined__(i32* %.threadid_temp., i32* %.zero.addr) #2 + call void @__kmpc_target_deinit(%struct.ident_t* @1, i1 false, i1 true) + ret void + +worker.exit: ; preds = %entry + ret void +} + +declare i32 @__kmpc_target_init(%struct.ident_t*, i1, i1, i1) + +; Function Attrs: convergent norecurse nounwind +define internal void @__omp_outlined__(i32* noalias %.global_tid., i32* noalias %.bound_tid.) #0 { +; CHECK-LABEL: @__omp_outlined__( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[DOTGLOBAL_TID__PRIV:%.*]] = alloca i32, align 4 +; CHECK-NEXT: store i32 [[TMP0:%.*]], i32* [[DOTGLOBAL_TID__PRIV]], align 4 +; CHECK-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [0 x i8*], align 8 +; CHECK-NEXT: br label [[FOR_COND:%.*]] +; CHECK: for.cond: +; CHECK-NEXT: [[I_0:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INC:%.*]], [[FOR_INC:%.*]] ] +; CHECK-NEXT: [[CMP:%.*]] = icmp slt i32 [[I_0]], 100 +; CHECK-NEXT: br i1 [[CMP]], label [[FOR_BODY:%.*]], label [[FOR_END:%.*]] +; CHECK: for.body: +; CHECK-NEXT: [[TMP1:%.*]] = load i32, i32* [[DOTGLOBAL_TID__PRIV]], align 4 +; CHECK-NEXT: [[TMP2:%.*]] = bitcast [0 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** +; CHECK-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* noundef @[[GLOB1]], i32 [[TMP1]], i32 noundef 1, i32 noundef -1, i32 noundef -1, i8* noundef bitcast (void (i32*, i32*)* @__omp_outlined__1 to i8*), i8* noundef bitcast (void (i16, i32)* @__omp_outlined__1_wrapper to i8*), i8** noundef [[TMP2]], i64 noundef 0) #[[ATTR3]] +; CHECK-NEXT: br label [[FOR_INC]] +; CHECK: for.inc: +; CHECK-NEXT: [[INC]] = add nsw i32 [[I_0]], 1 +; 
CHECK-NEXT: br label [[FOR_COND]], !llvm.loop [[LOOP6:![0-9]+]] +; CHECK: for.end: +; CHECK-NEXT: ret void +; +entry: + %captured_vars_addrs = alloca [0 x i8*], align 8 + br label %for.cond + +for.cond: ; preds = %for.inc, %entry + %i.0 = phi i32 [ 0, %entry ], [ %inc, %for.inc ] + %cmp = icmp slt i32 %i.0, 100 + br i1 %cmp, label %for.body, label %for.end + +for.body: ; preds = %for.cond + %0 = load i32, i32* %.global_tid., align 4 + %1 = bitcast [0 x i8*]* %captured_vars_addrs to i8** + call void @__kmpc_parallel_51(%struct.ident_t* @1, i32 %0, i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*)* @__omp_outlined__1 to i8*), i8* bitcast (void (i16, i32)* @__omp_outlined__1_wrapper to i8*), i8** %1, i64 0) + br label %for.inc + +for.inc: ; preds = %for.body + %inc = add nsw i32 %i.0, 1 + br label %for.cond, !llvm.loop !6 + +for.end: ; preds = %for.cond + ret void +} + +; Function Attrs: convergent norecurse nounwind +define internal void @__omp_outlined__1(i32* noalias %.global_tid., i32* noalias %.bound_tid.) 
#0 { +; CHECK-LABEL: @__omp_outlined__1( +; CHECK-NEXT: entry: +; CHECK-NEXT: call void @unknown() #[[ATTR5:[0-9]+]] +; CHECK-NEXT: ret void +; +entry: + call void @unknown() #3 + ret void +} + +; Function Attrs: convergent +declare void @unknown() #1 + +; Function Attrs: convergent norecurse nounwind +define internal void @__omp_outlined__1_wrapper(i16 zeroext %0, i32 %1) #0 { +; CHECK-LABEL: @__omp_outlined__1_wrapper( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 +; CHECK-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 +; CHECK-NEXT: [[GLOBAL_ARGS:%.*]] = alloca i8**, align 8 +; CHECK-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 +; CHECK-NEXT: store i32 [[TMP1:%.*]], i32* [[DOTADDR1]], align 4 +; CHECK-NEXT: call void @__kmpc_get_shared_variables(i8*** [[GLOBAL_ARGS]]) +; CHECK-NEXT: call void @__omp_outlined__1(i32* [[DOTADDR1]], i32* [[DOTZERO_ADDR]]) #[[ATTR2:[0-9]+]] +; CHECK-NEXT: ret void +; +entry: + %.addr1 = alloca i32, align 4 + %.zero.addr = alloca i32, align 4 + %global_args = alloca i8**, align 8 + store i32 0, i32* %.zero.addr, align 4 + store i32 %1, i32* %.addr1, align 4 + call void @__kmpc_get_shared_variables(i8*** %global_args) + call void @__omp_outlined__1(i32* %.addr1, i32* %.zero.addr) #2 + ret void +} + +declare void @__kmpc_get_shared_variables(i8***) + +declare void @__kmpc_parallel_51(%struct.ident_t*, i32, i32, i32, i32, i8*, i8*, i8**, i64) + +; Function Attrs: nounwind +declare i32 @__kmpc_global_thread_num(%struct.ident_t*) #2 + +declare void @__kmpc_target_deinit(%struct.ident_t*, i1, i1) + +attributes #0 = { convergent norecurse nounwind "frame-pointer"="all" "min-legal-vector-width"="0" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="sm_53" "target-features"="+ptx32,+sm_53" } +attributes #1 = { convergent "frame-pointer"="all" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="sm_53" "target-features"="+ptx32,+sm_53" } +attributes #2 = { 
nounwind } +attributes #3 = { convergent } + +!omp_offload.info = !{!0} +!nvvm.annotations = !{!1} +!llvm.module.flags = !{!2, !3, !4} +!llvm.ident = !{!5} + +!0 = !{i32 0, i32 44, i32 232567, !"sequential_loop", i32 4, i32 0} +!1 = !{void ()* @__omp_offloading_2c_38c77_sequential_loop_l4, !"kernel", i32 1} +!2 = !{i32 1, !"wchar_size", i32 4} +!3 = !{i32 7, !"PIC Level", i32 2} +!4 = !{i32 7, !"frame-pointer", i32 2} +!5 = !{!"clang version 13.0.0"} +!6 = distinct !{!6, !7} +!7 = !{!"llvm.loop.mustprogress"}