diff --git a/llvm/lib/Transforms/IPO/OpenMPOpt.cpp b/llvm/lib/Transforms/IPO/OpenMPOpt.cpp
--- a/llvm/lib/Transforms/IPO/OpenMPOpt.cpp
+++ b/llvm/lib/Transforms/IPO/OpenMPOpt.cpp
@@ -3340,9 +3340,6 @@
   }
 
   bool changeToSPMDMode(Attributor &A, ChangeStatus &Changed) {
-    if (!mayContainParallelRegion())
-      return false;
-
     auto &OMPInfoCache = static_cast<OMPInformationCache &>(A.getInfoCache());
 
     if (!SPMDCompatibilityTracker.isAssumed()) {
@@ -3401,222 +3398,276 @@
     // We will now unconditionally modify the IR, indicate a change.
     Changed = ChangeStatus::CHANGED;
 
-    auto CreateGuardedRegion = [&](Instruction *RegionStartI,
-                                   Instruction *RegionEndI) {
-      LoopInfo *LI = nullptr;
-      DominatorTree *DT = nullptr;
-      MemorySSAUpdater *MSU = nullptr;
-      using InsertPointTy = OpenMPIRBuilder::InsertPointTy;
-
-      BasicBlock *ParentBB = RegionStartI->getParent();
-      Function *Fn = ParentBB->getParent();
-      Module &M = *Fn->getParent();
-
-      // Create all the blocks and logic.
-      // ParentBB:
-      //    goto RegionCheckTidBB
-      // RegionCheckTidBB:
-      //    Tid = __kmpc_hardware_thread_id()
-      //    if (Tid != 0)
-      //        goto RegionBarrierBB
-      // RegionStartBB:
-      //    <execute instructions guarded>
-      //    goto RegionEndBB
-      // RegionEndBB:
-      //    <store escaping values to shared mem>
-      //    goto RegionBarrierBB
-      //  RegionBarrierBB:
-      //    __kmpc_simple_barrier_spmd()
-      //    // second barrier is omitted if lacking escaping values.
-      //    <load escaping values from shared mem>
-      //    __kmpc_simple_barrier_spmd()
-      //    goto RegionExitBB
-      // RegionExitBB:
-      //    <execute rest of instructions>
-
-      BasicBlock *RegionEndBB = SplitBlock(ParentBB, RegionEndI->getNextNode(),
-                                           DT, LI, MSU, "region.guarded.end");
-      BasicBlock *RegionBarrierBB =
-          SplitBlock(RegionEndBB, &*RegionEndBB->getFirstInsertionPt(), DT, LI,
-                     MSU, "region.barrier");
-      BasicBlock *RegionExitBB =
-          SplitBlock(RegionBarrierBB, &*RegionBarrierBB->getFirstInsertionPt(),
-                     DT, LI, MSU, "region.exit");
-      BasicBlock *RegionStartBB =
-          SplitBlock(ParentBB, RegionStartI, DT, LI, MSU, "region.guarded");
-
-      assert(ParentBB->getUniqueSuccessor() == RegionStartBB &&
-             "Expected a different CFG");
+    auto &Ctx = getAnchorValue().getContext();
 
-      BasicBlock *RegionCheckTidBB = SplitBlock(
-          ParentBB, ParentBB->getTerminator(), DT, LI, MSU, "region.check.tid");
+    if (mayContainParallelRegion()) {
+      auto CreateGuardedRegion = [&](Instruction *RegionStartI,
+                                     Instruction *RegionEndI) {
+        LoopInfo *LI = nullptr;
+        DominatorTree *DT = nullptr;
+        MemorySSAUpdater *MSU = nullptr;
+        using InsertPointTy = OpenMPIRBuilder::InsertPointTy;
+
+        BasicBlock *ParentBB = RegionStartI->getParent();
+        Function *Fn = ParentBB->getParent();
+        Module &M = *Fn->getParent();
+
+        // Create all the blocks and logic.
+        // ParentBB:
+        //    goto RegionCheckTidBB
+        // RegionCheckTidBB:
+        //    Tid = __kmpc_hardware_thread_id()
+        //    if (Tid != 0)
+        //        goto RegionBarrierBB
+        // RegionStartBB:
+        //    <execute instructions guarded>
+        //    goto RegionEndBB
+        // RegionEndBB:
+        //    <store escaping values to shared mem>
+        //    goto RegionBarrierBB
+        // RegionBarrierBB:
+        //    __kmpc_simple_barrier_spmd()
+        //    // second barrier is omitted if lacking escaping values.
+        //    <load escaping values from shared mem>
+        //    __kmpc_simple_barrier_spmd()
+        //    goto RegionExitBB
+        // RegionExitBB:
+        //    <execute rest of instructions>
+
+        BasicBlock *RegionEndBB = SplitBlock(ParentBB, RegionEndI->getNextNode(),
+                                             DT, LI, MSU, "region.guarded.end");
+        BasicBlock *RegionBarrierBB =
+            SplitBlock(RegionEndBB, &*RegionEndBB->getFirstInsertionPt(), DT, LI,
+                       MSU, "region.barrier");
+        BasicBlock *RegionExitBB =
+            SplitBlock(RegionBarrierBB, &*RegionBarrierBB->getFirstInsertionPt(),
+                       DT, LI, MSU, "region.exit");
+        BasicBlock *RegionStartBB =
+            SplitBlock(ParentBB, RegionStartI, DT, LI, MSU, "region.guarded");
+
+        assert(ParentBB->getUniqueSuccessor() == RegionStartBB &&
+               "Expected a different CFG");
+
+        BasicBlock *RegionCheckTidBB = SplitBlock(
+            ParentBB, ParentBB->getTerminator(), DT, LI, MSU, "region.check.tid");
+
+        // Register basic blocks with the Attributor.
+        A.registerManifestAddedBasicBlock(*RegionEndBB);
+        A.registerManifestAddedBasicBlock(*RegionBarrierBB);
+        A.registerManifestAddedBasicBlock(*RegionExitBB);
+        A.registerManifestAddedBasicBlock(*RegionStartBB);
+        A.registerManifestAddedBasicBlock(*RegionCheckTidBB);
+
+        bool HasBroadcastValues = false;
+        // Find escaping outputs from the guarded region to outside users and
+        // broadcast their values to them.
+        for (Instruction &I : *RegionStartBB) {
+          SmallPtrSet<Instruction *, 4> OutsideUsers;
+          for (User *Usr : I.users()) {
+            Instruction &UsrI = *cast<Instruction>(Usr);
+            if (UsrI.getParent() != RegionStartBB)
+              OutsideUsers.insert(&UsrI);
+          }
 
-      // Register basic blocks with the Attributor.
-      A.registerManifestAddedBasicBlock(*RegionEndBB);
-      A.registerManifestAddedBasicBlock(*RegionBarrierBB);
-      A.registerManifestAddedBasicBlock(*RegionExitBB);
-      A.registerManifestAddedBasicBlock(*RegionStartBB);
-      A.registerManifestAddedBasicBlock(*RegionCheckTidBB);
+          if (OutsideUsers.empty())
+            continue;
 
-      bool HasBroadcastValues = false;
-      // Find escaping outputs from the guarded region to outside users and
-      // broadcast their values to them.
-      for (Instruction &I : *RegionStartBB) {
-        SmallPtrSet<Instruction *, 4> OutsideUsers;
-        for (User *Usr : I.users()) {
-          Instruction &UsrI = *cast<Instruction>(Usr);
-          if (UsrI.getParent() != RegionStartBB)
-            OutsideUsers.insert(&UsrI);
-        }
+          HasBroadcastValues = true;
 
-        if (OutsideUsers.empty())
-          continue;
+          // Emit a global variable in shared memory to store the broadcasted
+          // value.
+          auto *SharedMem = new GlobalVariable(
+              M, I.getType(), /* IsConstant */ false,
+              GlobalValue::InternalLinkage, UndefValue::get(I.getType()),
+              sanitizeForGlobalName(
+                  (I.getName() + ".guarded.output.alloc").str()),
+              nullptr, GlobalValue::NotThreadLocal,
+              static_cast<unsigned>(AddressSpace::Shared));
 
-        HasBroadcastValues = true;
+          // Emit a store instruction to update the value.
+          new StoreInst(&I, SharedMem, RegionEndBB->getTerminator());
 
-        // Emit a global variable in shared memory to store the broadcasted
-        // value.
-        auto *SharedMem = new GlobalVariable(
-            M, I.getType(), /* IsConstant */ false,
-            GlobalValue::InternalLinkage, UndefValue::get(I.getType()),
-            sanitizeForGlobalName(
-                (I.getName() + ".guarded.output.alloc").str()),
-            nullptr, GlobalValue::NotThreadLocal,
-            static_cast<unsigned>(AddressSpace::Shared));
-
-        // Emit a store instruction to update the value.
-        new StoreInst(&I, SharedMem, RegionEndBB->getTerminator());
-
-        LoadInst *LoadI = new LoadInst(I.getType(), SharedMem,
-                                       I.getName() + ".guarded.output.load",
-                                       RegionBarrierBB->getTerminator());
-
-        // Emit a load instruction and replace uses of the output value.
-        for (Instruction *UsrI : OutsideUsers)
-          UsrI->replaceUsesOfWith(&I, LoadI);
-      }
+          LoadInst *LoadI = new LoadInst(I.getType(), SharedMem,
+                                         I.getName() + ".guarded.output.load",
+                                         RegionBarrierBB->getTerminator());
 
-      auto &OMPInfoCache = static_cast<OMPInformationCache &>(A.getInfoCache());
+          // Replace uses of the output value outside the region with the load.
+          for (Instruction *UsrI : OutsideUsers)
+            UsrI->replaceUsesOfWith(&I, LoadI);
+        }
 
-      // Go to tid check BB in ParentBB.
-      const DebugLoc DL = ParentBB->getTerminator()->getDebugLoc();
-      ParentBB->getTerminator()->eraseFromParent();
-      OpenMPIRBuilder::LocationDescription Loc(
-          InsertPointTy(ParentBB, ParentBB->end()), DL);
-      OMPInfoCache.OMPBuilder.updateToLocation(Loc);
-      uint32_t SrcLocStrSize;
-      auto *SrcLocStr =
-          OMPInfoCache.OMPBuilder.getOrCreateSrcLocStr(Loc, SrcLocStrSize);
-      Value *Ident =
-          OMPInfoCache.OMPBuilder.getOrCreateIdent(SrcLocStr, SrcLocStrSize);
-      BranchInst::Create(RegionCheckTidBB, ParentBB)->setDebugLoc(DL);
-
-      // Add check for Tid in RegionCheckTidBB
-      RegionCheckTidBB->getTerminator()->eraseFromParent();
-      OpenMPIRBuilder::LocationDescription LocRegionCheckTid(
-          InsertPointTy(RegionCheckTidBB, RegionCheckTidBB->end()), DL);
-      OMPInfoCache.OMPBuilder.updateToLocation(LocRegionCheckTid);
-      FunctionCallee HardwareTidFn =
-          OMPInfoCache.OMPBuilder.getOrCreateRuntimeFunction(
-              M, OMPRTL___kmpc_get_hardware_thread_id_in_block);
-      CallInst *Tid =
-          OMPInfoCache.OMPBuilder.Builder.CreateCall(HardwareTidFn, {});
-      Tid->setDebugLoc(DL);
-      OMPInfoCache.setCallingConvention(HardwareTidFn, Tid);
-      Value *TidCheck = OMPInfoCache.OMPBuilder.Builder.CreateIsNull(Tid);
-      OMPInfoCache.OMPBuilder.Builder
-          .CreateCondBr(TidCheck, RegionStartBB, RegionBarrierBB)
-          ->setDebugLoc(DL);
-
-      // First barrier for synchronization, ensures main thread has updated
-      // values.
-      FunctionCallee BarrierFn =
-          OMPInfoCache.OMPBuilder.getOrCreateRuntimeFunction(
-              M, OMPRTL___kmpc_barrier_simple_spmd);
-      OMPInfoCache.OMPBuilder.updateToLocation(InsertPointTy(
-          RegionBarrierBB, RegionBarrierBB->getFirstInsertionPt()));
-      CallInst *Barrier =
-          OMPInfoCache.OMPBuilder.Builder.CreateCall(BarrierFn, {Ident, Tid});
-      Barrier->setDebugLoc(DL);
-      OMPInfoCache.setCallingConvention(BarrierFn, Barrier);
-
-      // Second barrier ensures workers have read broadcast values.
-      if (HasBroadcastValues) {
-        CallInst *Barrier = CallInst::Create(BarrierFn, {Ident, Tid}, "",
-                                             RegionBarrierBB->getTerminator());
+        auto &OMPInfoCache = static_cast<OMPInformationCache &>(A.getInfoCache());
+
+        // Go to tid check BB in ParentBB.
+        const DebugLoc DL = ParentBB->getTerminator()->getDebugLoc();
+        ParentBB->getTerminator()->eraseFromParent();
+        OpenMPIRBuilder::LocationDescription Loc(
+            InsertPointTy(ParentBB, ParentBB->end()), DL);
+        OMPInfoCache.OMPBuilder.updateToLocation(Loc);
+        uint32_t SrcLocStrSize;
+        auto *SrcLocStr =
+            OMPInfoCache.OMPBuilder.getOrCreateSrcLocStr(Loc, SrcLocStrSize);
+        Value *Ident =
+            OMPInfoCache.OMPBuilder.getOrCreateIdent(SrcLocStr, SrcLocStrSize);
+        BranchInst::Create(RegionCheckTidBB, ParentBB)->setDebugLoc(DL);
+
+        // Add check for Tid in RegionCheckTidBB
+        RegionCheckTidBB->getTerminator()->eraseFromParent();
+        OpenMPIRBuilder::LocationDescription LocRegionCheckTid(
+            InsertPointTy(RegionCheckTidBB, RegionCheckTidBB->end()), DL);
+        OMPInfoCache.OMPBuilder.updateToLocation(LocRegionCheckTid);
+        FunctionCallee HardwareTidFn =
+            OMPInfoCache.OMPBuilder.getOrCreateRuntimeFunction(
+                M, OMPRTL___kmpc_get_hardware_thread_id_in_block);
+        CallInst *Tid =
+            OMPInfoCache.OMPBuilder.Builder.CreateCall(HardwareTidFn, {});
+        Tid->setDebugLoc(DL);
+        OMPInfoCache.setCallingConvention(HardwareTidFn, Tid);
+        Value *TidCheck = OMPInfoCache.OMPBuilder.Builder.CreateIsNull(Tid);
+        OMPInfoCache.OMPBuilder.Builder
+            .CreateCondBr(TidCheck, RegionStartBB, RegionBarrierBB)
+            ->setDebugLoc(DL);
+
+        // First barrier for synchronization, ensures main thread has updated
+        // values.
+        FunctionCallee BarrierFn =
+            OMPInfoCache.OMPBuilder.getOrCreateRuntimeFunction(
+                M, OMPRTL___kmpc_barrier_simple_spmd);
+        OMPInfoCache.OMPBuilder.updateToLocation(InsertPointTy(
+            RegionBarrierBB, RegionBarrierBB->getFirstInsertionPt()));
+        CallInst *Barrier =
+            OMPInfoCache.OMPBuilder.Builder.CreateCall(BarrierFn, {Ident, Tid});
         Barrier->setDebugLoc(DL);
         OMPInfoCache.setCallingConvention(BarrierFn, Barrier);
-      }
-    };
 
-    auto &AllocSharedRFI = OMPInfoCache.RFIs[OMPRTL___kmpc_alloc_shared];
-    SmallPtrSet<BasicBlock *, 8> Visited;
-    for (Instruction *GuardedI : SPMDCompatibilityTracker) {
-      BasicBlock *BB = GuardedI->getParent();
-      if (!Visited.insert(BB).second)
-        continue;
+        // Second barrier ensures workers have read broadcast values.
+        if (HasBroadcastValues) {
+          CallInst *Barrier = CallInst::Create(BarrierFn, {Ident, Tid}, "",
+                                               RegionBarrierBB->getTerminator());
+          Barrier->setDebugLoc(DL);
+          OMPInfoCache.setCallingConvention(BarrierFn, Barrier);
+        }
+      };
 
-      SmallVector<std::pair<Instruction *, Instruction *>> Reorders;
-      Instruction *LastEffect = nullptr;
-      BasicBlock::reverse_iterator IP = BB->rbegin(), IPEnd = BB->rend();
-      while (++IP != IPEnd) {
-        if (!IP->mayHaveSideEffects() && !IP->mayReadFromMemory())
-          continue;
-        Instruction *I = &*IP;
-        if (OpenMPOpt::getCallIfRegularCall(*I, &AllocSharedRFI))
-          continue;
-        if (!I->user_empty() || !SPMDCompatibilityTracker.contains(I)) {
-          LastEffect = nullptr;
+      auto &AllocSharedRFI = OMPInfoCache.RFIs[OMPRTL___kmpc_alloc_shared];
+      SmallPtrSet<BasicBlock *, 8> Visited;
+      for (Instruction *GuardedI : SPMDCompatibilityTracker) {
+        BasicBlock *BB = GuardedI->getParent();
+        if (!Visited.insert(BB).second)
           continue;
+
+        SmallVector<std::pair<Instruction *, Instruction *>> Reorders;
+        Instruction *LastEffect = nullptr;
+        BasicBlock::reverse_iterator IP = BB->rbegin(), IPEnd = BB->rend();
+        while (++IP != IPEnd) {
+          if (!IP->mayHaveSideEffects() && !IP->mayReadFromMemory())
+            continue;
+          Instruction *I = &*IP;
+          if (OpenMPOpt::getCallIfRegularCall(*I, &AllocSharedRFI))
+            continue;
+          if (!I->user_empty() || !SPMDCompatibilityTracker.contains(I)) {
+            LastEffect = nullptr;
+            continue;
+          }
+          if (LastEffect)
+            Reorders.push_back({I, LastEffect});
+          LastEffect = &*IP;
         }
-        if (LastEffect)
-          Reorders.push_back({I, LastEffect});
-        LastEffect = &*IP;
+        for (auto &Reorder : Reorders)
+          Reorder.first->moveBefore(Reorder.second);
       }
-      for (auto &Reorder : Reorders)
-        Reorder.first->moveBefore(Reorder.second);
-    }
 
-    SmallVector<std::pair<Instruction *, Instruction *>, 4> GuardedRegions;
-
-    for (Instruction *GuardedI : SPMDCompatibilityTracker) {
-      BasicBlock *BB = GuardedI->getParent();
-      auto *CalleeAA = A.lookupAAFor<AAKernelInfo>(
-          IRPosition::function(*GuardedI->getFunction()), nullptr,
-          DepClassTy::NONE);
-      assert(CalleeAA != nullptr && "Expected Callee AAKernelInfo");
-      auto &CalleeAAFunction = *cast<AAKernelInfoFunction>(CalleeAA);
-      // Continue if instruction is already guarded.
-      if (CalleeAAFunction.getGuardedInstructions().contains(GuardedI))
-        continue;
+      SmallVector<std::pair<Instruction *, Instruction *>, 4> GuardedRegions;
+
+      for (Instruction *GuardedI : SPMDCompatibilityTracker) {
+        BasicBlock *BB = GuardedI->getParent();
+        auto *CalleeAA = A.lookupAAFor<AAKernelInfo>(
+            IRPosition::function(*GuardedI->getFunction()), nullptr,
+            DepClassTy::NONE);
+        assert(CalleeAA != nullptr && "Expected Callee AAKernelInfo");
+        auto &CalleeAAFunction = *cast<AAKernelInfoFunction>(CalleeAA);
+        // Continue if instruction is already guarded.
+        if (CalleeAAFunction.getGuardedInstructions().contains(GuardedI))
+          continue;
 
-      Instruction *GuardedRegionStart = nullptr, *GuardedRegionEnd = nullptr;
-      for (Instruction &I : *BB) {
-        // If instruction I needs to be guarded update the guarded region
-        // bounds.
-        if (SPMDCompatibilityTracker.contains(&I)) {
-          CalleeAAFunction.getGuardedInstructions().insert(&I);
-          if (GuardedRegionStart)
-            GuardedRegionEnd = &I;
-          else
-            GuardedRegionStart = GuardedRegionEnd = &I;
+        Instruction *GuardedRegionStart = nullptr, *GuardedRegionEnd = nullptr;
+        for (Instruction &I : *BB) {
+          // If instruction I needs to be guarded update the guarded region
+          // bounds.
+          if (SPMDCompatibilityTracker.contains(&I)) {
+            CalleeAAFunction.getGuardedInstructions().insert(&I);
+            if (GuardedRegionStart)
+              GuardedRegionEnd = &I;
+            else
+              GuardedRegionStart = GuardedRegionEnd = &I;
 
-          continue;
-        }
+            continue;
+          }
 
-        // Instruction I does not need guarding, store
-        // any region found and reset bounds.
-        if (GuardedRegionStart) {
-          GuardedRegions.push_back(
-              std::make_pair(GuardedRegionStart, GuardedRegionEnd));
-          GuardedRegionStart = nullptr;
-          GuardedRegionEnd = nullptr;
+          // Instruction I does not need guarding, store
+          // any region found and reset bounds.
+          if (GuardedRegionStart) {
+            GuardedRegions.push_back(
+                std::make_pair(GuardedRegionStart, GuardedRegionEnd));
+            GuardedRegionStart = nullptr;
+            GuardedRegionEnd = nullptr;
+          }
         }
       }
-    }
 
-    for (auto &GR : GuardedRegions)
-      CreateGuardedRegion(GR.first, GR.second);
+      for (auto &GR : GuardedRegions)
+        CreateGuardedRegion(GR.first, GR.second);
+    } else {
+      // mayContainParallelRegion() is false here, so only allow 1 thread per
+      // block to continue executing the user code.
+      //
+      //     InitCB = __kmpc_target_init(...)
+      //     ThreadIdInBlock = __kmpc_get_hardware_thread_id_in_block();
+      //     if (ThreadIdInBlock != 0) return;
+      // UserCode:
+      //     // user code
+      //
+      Function *Kernel = getAssociatedFunction();
+      assert(Kernel && "Expected an associated function!");
+
+      // Create block for user code to branch to from initial block.
+      BasicBlock *InitBB = KernelInitCB->getParent();
+      BasicBlock *UserCodeBB = InitBB->splitBasicBlock(
+          KernelInitCB->getNextNode(), "main.thread.user_code");
+      BasicBlock *ReturnBB =
+          BasicBlock::Create(Ctx, "exit.threads", Kernel, UserCodeBB);
+
+      // Register blocks with attributor:
+      A.registerManifestAddedBasicBlock(*InitBB);
+      A.registerManifestAddedBasicBlock(*UserCodeBB);
+      A.registerManifestAddedBasicBlock(*ReturnBB);
+
+      // Debug location taken from the init call:
+      const DebugLoc &DLoc = KernelInitCB->getDebugLoc();
+      ReturnInst::Create(Ctx, ReturnBB)->setDebugLoc(DLoc);
+      // Drop the branch splitBasicBlock added; a conditional one replaces it.
+      InitBB->getTerminator()->eraseFromParent();
+
+      // Prepare function call to OMPRTL___kmpc_get_hardware_thread_id_in_block:
+      Module &M = *Kernel->getParent();
+      FunctionCallee ThreadIdInBlockFn =
+          OMPInfoCache.OMPBuilder.getOrCreateRuntimeFunction(
+              M, OMPRTL___kmpc_get_hardware_thread_id_in_block);
+
+      // Get thread ID in block.
+      CallInst *ThreadIdInBlock =
+          CallInst::Create(ThreadIdInBlockFn, "thread_id.in.block", InitBB);
+      OMPInfoCache.setCallingConvention(ThreadIdInBlockFn, ThreadIdInBlock);
+      ThreadIdInBlock->setDebugLoc(DLoc);
+
+      // Eliminate all threads with ID != 0: for them the compare is true.
+      Instruction *IsNotMainThread =
+          ICmpInst::Create(ICmpInst::ICmp, CmpInst::ICMP_NE, ThreadIdInBlock,
+                           ConstantInt::get(ThreadIdInBlock->getType(), 0),
+                           "thread.is_main", InitBB);
+      IsNotMainThread->setDebugLoc(DLoc);
+      BranchInst::Create(ReturnBB, UserCodeBB, IsNotMainThread, InitBB);
+    }
 
     // Adjust the global exec mode flag that tells the runtime what mode this
     // kernel is executed in.
@@ -3633,7 +3684,6 @@
     const int InitRequiresFullRuntimeArgNo = 3;
     const int DeinitRequiresFullRuntimeArgNo = 2;
 
-    auto &Ctx = getAnchorValue().getContext();
     A.changeUseAfterManifest(
         KernelInitCB->getArgOperandUse(InitModeArgNo),
         *ConstantInt::getSigned(IntegerType::getInt8Ty(Ctx),
diff --git a/llvm/test/Transforms/OpenMP/get_hardware_num_threads_in_block_fold.ll b/llvm/test/Transforms/OpenMP/get_hardware_num_threads_in_block_fold.ll
--- a/llvm/test/Transforms/OpenMP/get_hardware_num_threads_in_block_fold.ll
+++ b/llvm/test/Transforms/OpenMP/get_hardware_num_threads_in_block_fold.ll
@@ -8,9 +8,9 @@
 
 @G = external global i32
 ;.
-; CHECK: @[[KERNEL0_EXEC_MODE:[a-zA-Z0-9_$"\\.-]+]] = weak constant i8 1
+; CHECK: @[[KERNEL0_EXEC_MODE:[a-zA-Z0-9_$"\\.-]+]] = weak constant i8 3
 ; CHECK: @[[G:[a-zA-Z0-9_$"\\.-]+]] = external global i32
-; CHECK: @[[KERNEL1_EXEC_MODE:[a-zA-Z0-9_$"\\.-]+]] = weak constant i8 1
+; CHECK: @[[KERNEL1_EXEC_MODE:[a-zA-Z0-9_$"\\.-]+]] = weak constant i8 3
 ; CHECK: @[[KERNEL2_EXEC_MODE:[a-zA-Z0-9_$"\\.-]+]] = weak constant i8 3
 ; CHECK: @[[GLOB0:[0-9]+]] = private unnamed_addr constant [23 x i8] c"
 ; CHECK: @[[GLOB1:[0-9]+]] = private unnamed_addr constant [[STRUCT_IDENT_T:%.*]] { i32 0, i32 2, i32 0, i32 22, i8* getelementptr inbounds ([23 x i8], [23 x i8]* @[[GLOB0]], i32 0, i32 0) }, align 8
@@ -18,11 +18,21 @@
 define weak void @kernel0() #0 {
 ; CHECK-LABEL: define {{[^@]+}}@kernel0
 ; CHECK-SAME: () #[[ATTR0:[0-9]+]] {
-; CHECK-NEXT:    [[I:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* null, i8 1, i1 false, i1 false)
+; CHECK-NEXT:    [[I:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* null, i8 2, i1 false, i1 false)
+
+; This target region contains no parallel region, so it is switched to SPMD mode but only 1 thread per team runs the user code.
+; CHECK-NEXT:    [[THREAD_ID_IN_BLOCK:%.*]] = call i32 @__kmpc_get_hardware_thread_id_in_block()
+; CHECK-NEXT:    [[IS_MAIN_THREAD:%.*]] = icmp ne i32 [[THREAD_ID_IN_BLOCK]], 0
+; CHECK-NEXT:    br i1 [[IS_MAIN_THREAD]], label %[[EXIT_THREADS_BB:.*]], label %[[MAIN_THREAD_USER_CODE_BB:.*]]
+
+; CHECK:       [[EXIT_THREADS_BB]]:
+; CHECK-NEXT:    ret void
+
+; CHECK:       [[MAIN_THREAD_USER_CODE_BB]]:
 ; CHECK-NEXT:    call void @helper0() #[[ATTR1:[0-9]+]]
 ; CHECK-NEXT:    call void @helper1() #[[ATTR1]]
 ; CHECK-NEXT:    call void @helper2() #[[ATTR1]]
-; CHECK-NEXT:    call void @__kmpc_target_deinit(%struct.ident_t* null, i8 1, i1 false)
+; CHECK-NEXT:    call void @__kmpc_target_deinit(%struct.ident_t* null, i8 2, i1 false)
 ; CHECK-NEXT:    ret void
 ;
   %i = call i32 @__kmpc_target_init(%struct.ident_t* null, i8 1, i1 false, i1 false)
@@ -38,9 +48,19 @@
 define weak void @kernel1() #0 {
 ; CHECK-LABEL: define {{[^@]+}}@kernel1
 ; CHECK-SAME: () #[[ATTR0]] {
-; CHECK-NEXT:    [[I:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* null, i8 1, i1 false, i1 false)
+; CHECK-NEXT:    [[I:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* null, i8 2, i1 false, i1 false)
+
+; This target region contains no parallel region, so it is switched to SPMD mode but only 1 thread per team runs the user code.
+; CHECK-NEXT:    [[THREAD_ID_IN_BLOCK:%.*]] = call i32 @__kmpc_get_hardware_thread_id_in_block()
+; CHECK-NEXT:    [[IS_MAIN_THREAD:%.*]] = icmp ne i32 [[THREAD_ID_IN_BLOCK]], 0
+; CHECK-NEXT:    br i1 [[IS_MAIN_THREAD]], label %[[EXIT_THREADS_BB:.*]], label %[[MAIN_THREAD_USER_CODE_BB:.*]]
+
+; CHECK:       [[EXIT_THREADS_BB]]:
+; CHECK-NEXT:    ret void
+
+; CHECK:       [[MAIN_THREAD_USER_CODE_BB]]:
 ; CHECK-NEXT:    call void @helper1() #[[ATTR1]]
-; CHECK-NEXT:    call void @__kmpc_target_deinit(%struct.ident_t* null, i8 1, i1 false)
+; CHECK-NEXT:    call void @__kmpc_target_deinit(%struct.ident_t* null, i8 2, i1 false)
 ; CHECK-NEXT:    ret void
 ;
   %i = call i32 @__kmpc_target_init(%struct.ident_t* null, i8 1, i1 false, i1 false)