Index: openmp/trunk/libomptarget/deviceRTLs/nvptx/src/omp_data.cu =================================================================== --- openmp/trunk/libomptarget/deviceRTLs/nvptx/src/omp_data.cu +++ openmp/trunk/libomptarget/deviceRTLs/nvptx/src/omp_data.cu @@ -31,7 +31,8 @@ __device__ __shared__ uint32_t usedMemIdx; __device__ __shared__ uint32_t usedSlotIdx; -__device__ __shared__ uint8_t parallelLevel[MAX_THREADS_PER_TEAM / WARPSIZE]; +__device__ __shared__ volatile uint8_t + parallelLevel[MAX_THREADS_PER_TEAM / WARPSIZE]; __device__ __shared__ uint16_t threadLimit; __device__ __shared__ uint16_t threadsInTeam; __device__ __shared__ uint16_t nThreads; Index: openmp/trunk/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.h =================================================================== --- openmp/trunk/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.h +++ openmp/trunk/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.h @@ -398,7 +398,7 @@ omptarget_nvptx_simpleMemoryManager; extern __device__ __shared__ uint32_t usedMemIdx; extern __device__ __shared__ uint32_t usedSlotIdx; -extern __device__ __shared__ uint8_t +extern __device__ __shared__ volatile uint8_t parallelLevel[MAX_THREADS_PER_TEAM / WARPSIZE]; extern __device__ __shared__ uint16_t threadLimit; extern __device__ __shared__ uint16_t threadsInTeam; Index: openmp/trunk/libomptarget/deviceRTLs/nvptx/src/sync.cu =================================================================== --- openmp/trunk/libomptarget/deviceRTLs/nvptx/src/sync.cu +++ openmp/trunk/libomptarget/deviceRTLs/nvptx/src/sync.cu @@ -62,6 +62,8 @@ // Barrier #1 is for synchronization among active threads. named_sync(L1_BARRIER, threads); } + } else { + __kmpc_flush(loc_ref); } // numberOfActiveOMPThreads > 1 PRINT0(LD_SYNC, "completed kmpc_barrier\n"); } @@ -130,7 +132,7 @@ EXTERN void __kmpc_flush(kmp_Ident *loc) { PRINT0(LD_IO, "call kmpc_flush\n"); - __threadfence_system(); + __threadfence(); } ////////////////////////////////////////////////////////////////////////////////