Index: llvm/lib/Transforms/IPO/OpenMPOpt.cpp =================================================================== --- llvm/lib/Transforms/IPO/OpenMPOpt.cpp +++ llvm/lib/Transforms/IPO/OpenMPOpt.cpp @@ -3456,7 +3456,11 @@ // Create all the blocks: // // InitCB = __kmpc_target_init(...) - // bool IsWorker = InitCB >= 0; + // BlockHwSize = + // __kmpc_get_hardware_num_threads_in_block(); + // WarpSize = __kmpc_get_warp_size(); + // BlockSize = BlockHwSize - WarpSize; + // bool IsWorker = InitCB >= 0 && InitCB < BlockSize; // if (IsWorker) { // SMBeginBB: __kmpc_barrier_simple_generic(...); // void *WorkFn; @@ -3509,16 +3513,38 @@ const DebugLoc &DLoc = KernelInitCB->getDebugLoc(); ReturnInst::Create(Ctx, StateMachineFinishedBB)->setDebugLoc(DLoc); - InitBB->getTerminator()->eraseFromParent(); - Instruction *IsWorker = - ICmpInst::Create(ICmpInst::ICmp, llvm::CmpInst::ICMP_NE, KernelInitCB, - ConstantInt::get(KernelInitCB->getType(), -1), - "thread.is_worker", InitBB); - IsWorker->setDebugLoc(DLoc); - BranchInst::Create(StateMachineBeginBB, UserCodeEntryBB, IsWorker, InitBB); Module &M = *Kernel->getParent(); + auto &OMPInfoCache = static_cast<OMPInformationCache &>(A.getInfoCache()); + FunctionCallee BlockHwSizeFn = + OMPInfoCache.OMPBuilder.getOrCreateRuntimeFunction( + M, OMPRTL___kmpc_get_hardware_num_threads_in_block); + FunctionCallee WarpSizeFn = + OMPInfoCache.OMPBuilder.getOrCreateRuntimeFunction( + M, OMPRTL___kmpc_get_warp_size); + Instruction *BlockHwSize = + CallInst::Create(BlockHwSizeFn, "block.hw_size", InitBB); + BlockHwSize->setDebugLoc(DLoc); + Instruction *WarpSize = CallInst::Create(WarpSizeFn, "warp.size", InitBB); + WarpSize->setDebugLoc(DLoc); + Instruction *BlockSize = + BinaryOperator::CreateSub(BlockHwSize, WarpSize, "block.size", InitBB); + BlockSize->setDebugLoc(DLoc); + Instruction *InBlock = + ICmpInst::Create(ICmpInst::ICmp, llvm::CmpInst::ICMP_ULT, KernelInitCB, + BlockSize, "thread.in_block", InitBB); + InBlock->setDebugLoc(DLoc); + + Instruction *NotMaster = 
+ ICmpInst::Create(ICmpInst::ICmp, llvm::CmpInst::ICMP_SGE, KernelInitCB, + ConstantInt::get(KernelInitCB->getType(), 0), + "thread.not_master", InitBB); + NotMaster->setDebugLoc(DLoc); + Instruction *IsWorker = BinaryOperator::CreateAnd( + NotMaster, InBlock, "thread.is_worker", InitBB); + IsWorker->setDebugLoc(DLoc); + BranchInst::Create(StateMachineBeginBB, UserCodeEntryBB, IsWorker, InitBB); // Create local storage for the work function pointer. const DataLayout &DL = M.getDataLayout(); @@ -3528,7 +3554,6 @@ "worker.work_fn.addr", &Kernel->getEntryBlock().front()); WorkFnAI->setDebugLoc(DLoc); - auto &OMPInfoCache = static_cast<OMPInformationCache &>(A.getInfoCache()); OMPInfoCache.OMPBuilder.updateToLocation( OpenMPIRBuilder::LocationDescription( IRBuilder<>::InsertPoint(StateMachineBeginBB, Index: openmp/libomptarget/DeviceRTL/src/Kernel.cpp =================================================================== --- openmp/libomptarget/DeviceRTL/src/Kernel.cpp +++ openmp/libomptarget/DeviceRTL/src/Kernel.cpp @@ -86,7 +86,21 @@ if (mapping::isInitialThreadInLevel0(IsSPMD)) return -1; - if (UseGenericStateMachine) + // Enter the generic state machine if enabled and if this thread can possibly + // be an active worker thread. + // + // The latter check is important for NVIDIA Pascal (but not Volta) and AMD + // GPU. In those cases, a single thread can apparently satisfy a barrier on + // behalf of all threads in the same warp. Thus, it would not be safe for + // other threads in the master thread's warp to reach the first + // synchronize::threads call in genericStateMachine before the master thread + // reaches its corresponding synchronize::threads call: that would permit all + // active worker threads to proceed before the master thread has actually set + // state::ParallelRegionFn, and then they would immediately quit without + // doing any work. mapping::getBlockSize() does not include any of the master + // thread's warp, so none of its threads can ever be active worker threads. 
+ if (UseGenericStateMachine && + mapping::getThreadIdInBlock() < mapping::getBlockSize()) genericStateMachine(Ident); return mapping::getThreadIdInBlock(); Index: openmp/libomptarget/deviceRTLs/common/src/omptarget.cu =================================================================== --- openmp/libomptarget/deviceRTLs/common/src/omptarget.cu +++ openmp/libomptarget/deviceRTLs/common/src/omptarget.cu @@ -179,8 +179,8 @@ EXTERN bool __kmpc_kernel_parallel(void**WorkFn); static void __kmpc_target_region_state_machine(ident_t *Ident) { - int TId = __kmpc_get_hardware_thread_id_in_block(); + do { void* WorkFn = 0; @@ -225,7 +225,22 @@ if (TId == GetMasterThreadID()) return -1; - if (UseGenericStateMachine) + // Enter the generic state machine if enabled and if this thread can possibly + // be an active worker thread. + // + // The latter check is important for NVIDIA Pascal (but not Volta) and AMD + // GPU. In those cases, a single thread can apparently satisfy a barrier on + // behalf of all threads in the same warp. Thus, it would not be safe for + // other threads in the master thread's warp to reach the first + // __kmpc_barrier_simple_spmd call in __kmpc_target_region_state_machine + // before the master thread reaches its corresponding + // __kmpc_barrier_simple_spmd call: that would permit all active worker + // threads to proceed before the master thread has actually set + // omptarget_nvptx_workFn, and then they would immediately quit without + // doing any work. GetNumberOfWorkersInTeam() does not include any of the + // master thread's warp, so none of its threads can ever be active worker + // threads. 
+ if (UseGenericStateMachine && TId < GetNumberOfWorkersInTeam()) __kmpc_target_region_state_machine(Ident); return TId; Index: openmp/libomptarget/deviceRTLs/common/src/support.cu =================================================================== --- openmp/libomptarget/deviceRTLs/common/src/support.cu +++ openmp/libomptarget/deviceRTLs/common/src/support.cu @@ -231,6 +231,7 @@ __attribute__((used, weak, optnone)) void keepAlive() { __kmpc_get_hardware_thread_id_in_block(); __kmpc_get_hardware_num_threads_in_block(); + __kmpc_get_warp_size(); __kmpc_barrier_simple_spmd(nullptr, 0); __kmpc_barrier_simple_generic(nullptr, 0); } Index: openmp/libomptarget/deviceRTLs/target_interface.h =================================================================== --- openmp/libomptarget/deviceRTLs/target_interface.h +++ openmp/libomptarget/deviceRTLs/target_interface.h @@ -23,6 +23,7 @@ EXTERN int GetBlockIdInKernel(); EXTERN NOINLINE int __kmpc_get_hardware_num_blocks(); EXTERN NOINLINE int __kmpc_get_hardware_num_threads_in_block(); +EXTERN unsigned __kmpc_get_warp_size(); EXTERN unsigned GetWarpId(); EXTERN unsigned GetWarpSize(); EXTERN unsigned GetLaneId(); Index: openmp/libomptarget/test/offloading/bug51781.c =================================================================== --- /dev/null +++ openmp/libomptarget/test/offloading/bug51781.c @@ -0,0 +1,38 @@ +// Use the generic state machine. On some architectures, other threads in the +// master thread's warp must avoid barrier instructions. +// +// RUN: %libomptarget-compile-run-and-check-generic + +// SPMDize. There is no master thread, so there's no issue. +// +// RUN: %libomptarget-compile-generic -O1 -Rpass=openmp-opt > %t.spmd 2>&1 +// RUN: %fcheck-nvptx64-nvidia-cuda -check-prefix=SPMD -input-file=%t.spmd +// RUN: %fcheck-amdgcn-amd-amdhsa -check-prefix=SPMD -input-file=%t.spmd +// RUN: %libomptarget-run-generic 2>&1 | %fcheck-generic +// +// SPMD: Transformed generic-mode kernel to SPMD-mode. 
+ +// Use the custom state machine, which must avoid the same barrier problem as +// the generic state machine. +// +// RUN: %libomptarget-compile-generic -O1 -Rpass=openmp-opt \ +// RUN: -mllvm -openmp-opt-disable-spmdization > %t.custom 2>&1 +// RUN: %fcheck-nvptx64-nvidia-cuda -check-prefix=CUSTOM -input-file=%t.custom +// RUN: %fcheck-amdgcn-amd-amdhsa -check-prefix=CUSTOM -input-file=%t.custom +// RUN: %libomptarget-run-generic 2>&1 | %fcheck-generic +// +// CUSTOM: Rewriting generic-mode kernel with a customized state machine. + +#include <stdio.h> +int main() { + int x = 0, y = 1; + #pragma omp target teams num_teams(1) map(tofrom:x, y) + { + x = 5; + #pragma omp parallel + y = 6; + } + // CHECK: 5, 6 + printf("%d, %d\n", x, y); + return 0; +}