Index: openmp/trunk/libomptarget/deviceRTLs/nvptx/src/loop.cu =================================================================== --- openmp/trunk/libomptarget/deviceRTLs/nvptx/src/loop.cu +++ openmp/trunk/libomptarget/deviceRTLs/nvptx/src/loop.cu @@ -86,7 +86,7 @@ T inputUb = ub; ub = lb + chunk - 1; // Clang uses i <= ub - last = ub == inputUb; + last = lb <= inputUb && inputUb <= ub; stride = loopSize; // make sure we only do 1 chunk per warp } Index: openmp/trunk/libomptarget/deviceRTLs/nvptx/src/reduction.cu =================================================================== --- openmp/trunk/libomptarget/deviceRTLs/nvptx/src/reduction.cu +++ openmp/trunk/libomptarget/deviceRTLs/nvptx/src/reduction.cu @@ -161,6 +161,11 @@ kmp_InterWarpCopyFctPtr cpyFct, bool isSPMDExecutionMode, bool isRuntimeUninitialized = false) { + uint32_t BlockThreadId = GetLogicalThreadIdInBlock(); + uint32_t NumThreads = GetNumberOfOmpThreads( + BlockThreadId, isSPMDExecutionMode, isRuntimeUninitialized); + if (NumThreads == 1) + return 1; /* * This reduce function handles reduction within a team. It handles * parallel regions in both L1 and L2 parallelism levels. It also @@ -173,9 +178,6 @@ */ #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 700 - uint32_t BlockThreadId = GetLogicalThreadIdInBlock(); - uint32_t NumThreads = GetNumberOfOmpThreads( - BlockThreadId, isSPMDExecutionMode, isRuntimeUninitialized); uint32_t WarpsNeeded = (NumThreads + WARPSIZE - 1) / WARPSIZE; uint32_t WarpId = BlockThreadId / WARPSIZE; @@ -219,10 +221,6 @@ // early. return gpu_irregular_simd_reduce(reduce_data, shflFct); - uint32_t BlockThreadId = GetLogicalThreadIdInBlock(); - uint32_t NumThreads = GetNumberOfOmpThreads( - BlockThreadId, isSPMDExecutionMode, isRuntimeUninitialized); - // When we have more than [warpsize] number of threads // a block reduction is performed here. // Index: openmp/trunk/libomptarget/deviceRTLs/nvptx/src/sync.cu =================================================================== --- openmp/trunk/libomptarget/deviceRTLs/nvptx/src/sync.cu +++ openmp/trunk/libomptarget/deviceRTLs/nvptx/src/sync.cu @@ -35,40 +35,46 @@ EXTERN int32_t __kmpc_cancel_barrier(kmp_Indent *loc_ref, int32_t tid) { PRINT0(LD_IO, "call kmpc_cancel_barrier\n"); - __syncthreads(); + __kmpc_barrier(loc_ref, tid); PRINT0(LD_SYNC, "completed kmpc_cancel_barrier\n"); return 0; } EXTERN void __kmpc_barrier(kmp_Indent *loc_ref, int32_t tid) { - tid = GetLogicalThreadIdInBlock(); - omptarget_nvptx_TaskDescr *currTaskDescr = - omptarget_nvptx_threadPrivateContext->GetTopLevelTaskDescr(tid); - if (!currTaskDescr->InL2OrHigherParallelRegion()) { - int numberOfActiveOMPThreads = - GetNumberOfOmpThreads(tid, isSPMDMode(), isRuntimeUninitialized()); + if (isSPMDMode()) { + __kmpc_barrier_simple_spmd(loc_ref, tid); + } else if (isRuntimeUninitialized()) { + __kmpc_barrier_simple_generic(loc_ref, tid); + } else { + tid = GetLogicalThreadIdInBlock(); + omptarget_nvptx_TaskDescr *currTaskDescr = + omptarget_nvptx_threadPrivateContext->GetTopLevelTaskDescr(tid); + if (!currTaskDescr->InL2OrHigherParallelRegion()) { + int numberOfActiveOMPThreads = + GetNumberOfOmpThreads(tid, isSPMDMode(), isRuntimeUninitialized()); #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 700 - // On Volta and newer architectures we require that all lanes in - // a warp (at least, all present for the kernel launch) participate in the - // barrier. This is enforced when launching the parallel region. An - // exception is when there are < WARPSIZE workers. In this case only 1 - // worker is started, so we don't need a barrier. - if (numberOfActiveOMPThreads > 1) { + // On Volta and newer architectures we require that all lanes in + // a warp (at least, all present for the kernel launch) participate in the + // barrier. This is enforced when launching the parallel region. An + // exception is when there are < WARPSIZE workers. In this case only 1 + // worker is started, so we don't need a barrier. + if (numberOfActiveOMPThreads > 1) { #endif - // The #threads parameter must be rounded up to the WARPSIZE. - int threads = - WARPSIZE * ((numberOfActiveOMPThreads + WARPSIZE - 1) / WARPSIZE); - - PRINT(LD_SYNC, - "call kmpc_barrier with %d omp threads, sync parameter %d\n", - numberOfActiveOMPThreads, threads); - // Barrier #1 is for synchronization among active threads. - named_sync(L1_BARRIER, threads); + // The #threads parameter must be rounded up to the WARPSIZE. + int threads = + WARPSIZE * ((numberOfActiveOMPThreads + WARPSIZE - 1) / WARPSIZE); + + PRINT(LD_SYNC, + "call kmpc_barrier with %d omp threads, sync parameter %d\n", + numberOfActiveOMPThreads, threads); + // Barrier #1 is for synchronization among active threads. + named_sync(L1_BARRIER, threads); #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 700 - } // numberOfActiveOMPThreads > 1 + } // numberOfActiveOMPThreads > 1 #endif + } + PRINT0(LD_SYNC, "completed kmpc_barrier\n"); } - PRINT0(LD_SYNC, "completed kmpc_barrier\n"); } // Emit a simple barrier call in SPMD mode. Assumes the caller is in an L0