Index: openmp/trunk/libomptarget/deviceRTLs/nvptx/src/loop.cu
===================================================================
--- openmp/trunk/libomptarget/deviceRTLs/nvptx/src/loop.cu
+++ openmp/trunk/libomptarget/deviceRTLs/nvptx/src/loop.cu
@@ -86,7 +86,7 @@
 
     T inputUb = ub;
     ub = lb + chunk - 1; // Clang uses i <= ub
-    last = ub == inputUb;
+    last = lb <= inputUb && inputUb <= ub;
     stride = loopSize; // make sure we only do 1 chunk per warp
   }
 
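The old test, last = (ub == inputUb), only fires when a chunk's upper bound
lands exactly on the loop's upper bound, so whenever the trip count is not a
multiple of the chunk size no thread ever reports the last iteration and
lastprivate updates are dropped. The new test instead marks the chunk that
contains the loop's upper bound. A standalone host-side sketch of the two
predicates (the bounds and chunk size below are made-up example values, not
taken from the patch):

    #include <cstdio>

    int main() {
      const int loopUb = 9; // loop runs i = 0..9, lowered as i <= loopUb
      const int chunk = 4;  // trip count (10) is not a multiple of chunk
      for (int lb = 0; lb <= loopUb; lb += chunk) {
        int inputUb = loopUb;
        int ub = lb + chunk - 1;                         // chunk is [lb, ub]
        bool lastOld = (ub == inputUb);                  // removed test
        bool lastNew = (lb <= inputUb && inputUb <= ub); // added test
        printf("chunk [%d,%2d]: old=%d new=%d\n", lb, ub, lastOld, lastNew);
      }
      // The final chunk [8,11] prints old=0 new=1: only the new test
      // identifies the chunk that executes the last iteration.
      return 0;
    }
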
Index: openmp/trunk/libomptarget/deviceRTLs/nvptx/src/reduction.cu
===================================================================
--- openmp/trunk/libomptarget/deviceRTLs/nvptx/src/reduction.cu
+++ openmp/trunk/libomptarget/deviceRTLs/nvptx/src/reduction.cu
@@ -161,6 +161,11 @@
                                      kmp_InterWarpCopyFctPtr cpyFct,
                                      bool isSPMDExecutionMode,
                                      bool isRuntimeUninitialized = false) {
+  uint32_t BlockThreadId = GetLogicalThreadIdInBlock();
+  uint32_t NumThreads = GetNumberOfOmpThreads(
+      BlockThreadId, isSPMDExecutionMode, isRuntimeUninitialized);
+  if (NumThreads == 1)
+    return 1;
   /*
    * This reduce function handles reduction within a team. It handles
    * parallel regions in both L1 and L2 parallelism levels. It also
@@ -173,9 +178,6 @@
    */
 
 #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 700
-  uint32_t BlockThreadId = GetLogicalThreadIdInBlock();
-  uint32_t NumThreads = GetNumberOfOmpThreads(
-      BlockThreadId, isSPMDExecutionMode, isRuntimeUninitialized);
   uint32_t WarpsNeeded = (NumThreads + WARPSIZE - 1) / WARPSIZE;
   uint32_t WarpId = BlockThreadId / WARPSIZE;
 
@@ -219,10 +221,6 @@
                                     // early.
     return gpu_irregular_simd_reduce(reduce_data, shflFct);
 
-  uint32_t BlockThreadId = GetLogicalThreadIdInBlock();
-  uint32_t NumThreads = GetNumberOfOmpThreads(
-      BlockThreadId, isSPMDExecutionMode, isRuntimeUninitialized);
-
   // When we have more than [warpsize] threads,
   // a block reduction is performed here.
   //
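Hoisting BlockThreadId and NumThreads to the top of the function lets it
return before touching any shuffle or inter-warp copy machinery when the team
runs a single OpenMP thread: in that case the caller's reduce_data already is
the reduction result, and returning 1 reports this thread as the one holding
the combined value. A hedged sketch of how compiler-emitted code consumes
that return value (the entry points __kmpc_nvptx_parallel_reduce_nowait and
__kmpc_nvptx_end_reduce_nowait are this runtime's; the caller shape and the
variable names are illustrative assumptions):

    // Roughly what the compiler emits for
    //   #pragma omp parallel for reduction(+ : sum)
    // reduce_data points at this thread's private partial sum.
    int32_t ret = __kmpc_nvptx_parallel_reduce_nowait(
        global_tid, /*num_vars=*/1, /*reduce_size=*/sizeof(int), reduce_data,
        shuffle_and_reduce_fn, inter_warp_copy_fn);
    if (ret == 1) {
      // Only the thread reported as holding the combined value folds it
      // back into the original variable; with NumThreads == 1 the early
      // return above makes this path trivially correct.
      sum += *(int *)reduce_data;
      __kmpc_nvptx_end_reduce_nowait(global_tid);
    }
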
Index: openmp/trunk/libomptarget/deviceRTLs/nvptx/src/sync.cu
===================================================================
--- openmp/trunk/libomptarget/deviceRTLs/nvptx/src/sync.cu
+++ openmp/trunk/libomptarget/deviceRTLs/nvptx/src/sync.cu
@@ -35,40 +35,46 @@
 
 EXTERN int32_t __kmpc_cancel_barrier(kmp_Indent *loc_ref, int32_t tid) {
   PRINT0(LD_IO, "call kmpc_cancel_barrier\n");
-  __syncthreads();
+  __kmpc_barrier(loc_ref, tid);
   PRINT0(LD_SYNC, "completed kmpc_cancel_barrier\n");
   return 0;
 }
 
 EXTERN void __kmpc_barrier(kmp_Indent *loc_ref, int32_t tid) {
-  tid = GetLogicalThreadIdInBlock();
-  omptarget_nvptx_TaskDescr *currTaskDescr =
-      omptarget_nvptx_threadPrivateContext->GetTopLevelTaskDescr(tid);
-  if (!currTaskDescr->InL2OrHigherParallelRegion()) {
-    int numberOfActiveOMPThreads =
-        GetNumberOfOmpThreads(tid, isSPMDMode(), isRuntimeUninitialized());
+  if (isSPMDMode()) {
+    __kmpc_barrier_simple_spmd(loc_ref, tid);
+  } else if (isRuntimeUninitialized()) {
+    __kmpc_barrier_simple_generic(loc_ref, tid);
+  } else {
+    tid = GetLogicalThreadIdInBlock();
+    omptarget_nvptx_TaskDescr *currTaskDescr =
+        omptarget_nvptx_threadPrivateContext->GetTopLevelTaskDescr(tid);
+    if (!currTaskDescr->InL2OrHigherParallelRegion()) {
+      int numberOfActiveOMPThreads =
+          GetNumberOfOmpThreads(tid, isSPMDMode(), isRuntimeUninitialized());
 #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 700
-    // On Volta and newer architectures we require that all lanes in
-    // a warp (at least, all present for the kernel launch) participate in the
-    // barrier.  This is enforced when launching the parallel region.  An
-    // exception is when there are < WARPSIZE workers.  In this case only 1
-    // worker is started, so we don't need a barrier.
-    if (numberOfActiveOMPThreads > 1) {
+      // On Volta and newer architectures we require that all lanes in
+      // a warp (at least, all present for the kernel launch) participate in the
+      // barrier.  This is enforced when launching the parallel region.  An
+      // exception is when there are < WARPSIZE workers.  In this case only 1
+      // worker is started, so we don't need a barrier.
+      if (numberOfActiveOMPThreads > 1) {
 #endif
-      // The #threads parameter must be rounded up to the WARPSIZE.
-      int threads =
-          WARPSIZE * ((numberOfActiveOMPThreads + WARPSIZE - 1) / WARPSIZE);
-
-      PRINT(LD_SYNC,
-            "call kmpc_barrier with %d omp threads, sync parameter %d\n",
-            numberOfActiveOMPThreads, threads);
-      // Barrier #1 is for synchronization among active threads.
-      named_sync(L1_BARRIER, threads);
+        // The #threads parameter must be a multiple of WARPSIZE (round up).
+        int threads =
+            WARPSIZE * ((numberOfActiveOMPThreads + WARPSIZE - 1) / WARPSIZE);
+
+        PRINT(LD_SYNC,
+              "call kmpc_barrier with %d omp threads, sync parameter %d\n",
+              numberOfActiveOMPThreads, threads);
+        // Barrier #1 is for synchronization among active threads.
+        named_sync(L1_BARRIER, threads);
 #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 700
-    } // numberOfActiveOMPThreads > 1
+      } // numberOfActiveOMPThreads > 1
 #endif
+    }
+    PRINT0(LD_SYNC, "completed kmpc_barrier\n");
   }
-  PRINT0(LD_SYNC, "completed kmpc_barrier\n");
 }
 
 // Emit a simple barrier call in SPMD mode.  Assumes the caller is in an L0
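
__kmpc_cancel_barrier now funnels through __kmpc_barrier rather than issuing
a bare __syncthreads(), so cancellation barriers pick up the same mode-aware
behavior. __kmpc_barrier itself dispatches on the execution mode: SPMD
kernels synchronize the full CTA, a runtime left uninitialized takes the
simple generic barrier, and only the fully initialized generic path still
needs the named barrier, with the thread count rounded up to warp granularity
because the count passed to PTX bar.sync must cover whole warps. A small
host-side sketch of that rounding (the sample thread counts are
illustrative):

    #include <cstdio>

    constexpr int WARPSIZE = 32;

    // Same rounding as the patch: next multiple of WARPSIZE, since
    // named_sync forwards the count to "bar.sync id, threads" and that
    // count must be a multiple of the warp size.
    int syncParameter(int numberOfActiveOMPThreads) {
      return WARPSIZE * ((numberOfActiveOMPThreads + WARPSIZE - 1) / WARPSIZE);
    }

    int main() {
      const int samples[] = {1, 24, 32, 33, 160};
      for (int n : samples)
        printf("%3d OMP threads -> sync parameter %3d\n", n, syncParameter(n));
      // 33 active threads synchronize with a count of 64: two full warps.
      return 0;
    }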