[OpenMP][nvptx] Move the named-barrier helper into target_impl.h

Summary of the hunks below:
  * support.h / supporti.h: remove the declaration and definition of
    named_sync(const int barrier, const int num_threads).
  * target_impl.h: add __kmpc_impl_named_sync(int barrier,
    uint32_t num_threads) with the same "bar.sync %0, %1;" inline-asm body
    (volatile, "memory" clobber).
  * loop.cu, reduction.cu, sync.cu: switch every caller to the new name; the
    arguments passed at each call site are unchanged.

NOTE(review): the indentation of context lines was reconstructed from a
whitespace-collapsed copy of this patch -- confirm against the tree, or apply
with `git apply --ignore-whitespace`.

diff --git a/openmp/libomptarget/deviceRTLs/nvptx/src/loop.cu b/openmp/libomptarget/deviceRTLs/nvptx/src/loop.cu
--- a/openmp/libomptarget/deviceRTLs/nvptx/src/loop.cu
+++ b/openmp/libomptarget/deviceRTLs/nvptx/src/loop.cu
@@ -765,7 +765,7 @@
     // is started, so we don't need a barrier.
     if (NumThreads > 1) {
 #endif
-      named_sync(L1_BARRIER, WARPSIZE * NumWarps);
+      __kmpc_impl_named_sync(L1_BARRIER, WARPSIZE * NumWarps);
 #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 700
     }
 #endif
diff --git a/openmp/libomptarget/deviceRTLs/nvptx/src/reduction.cu b/openmp/libomptarget/deviceRTLs/nvptx/src/reduction.cu
--- a/openmp/libomptarget/deviceRTLs/nvptx/src/reduction.cu
+++ b/openmp/libomptarget/deviceRTLs/nvptx/src/reduction.cu
@@ -256,7 +256,7 @@
   // If we guard this barrier as follows it leads to deadlock, probably
   // because of a compiler bug: if (!IsGenericMode()) __syncthreads();
   uint16_t SyncWarps = (NumThreads + WARPSIZE - 1) / WARPSIZE;
-  named_sync(L1_BARRIER, SyncWarps * WARPSIZE);
+  __kmpc_impl_named_sync(L1_BARRIER, SyncWarps * WARPSIZE);
 
   // If this team is not the last, quit.
   if (/* Volatile read by all threads */ !IsLastTeam)
diff --git a/openmp/libomptarget/deviceRTLs/nvptx/src/support.h b/openmp/libomptarget/deviceRTLs/nvptx/src/support.h
--- a/openmp/libomptarget/deviceRTLs/nvptx/src/support.h
+++ b/openmp/libomptarget/deviceRTLs/nvptx/src/support.h
@@ -84,11 +84,6 @@
   ((void *)((char *)((void *)(_addr)) - (_bytes)))
 
 ////////////////////////////////////////////////////////////////////////////////
-// Named Barrier Routines
-////////////////////////////////////////////////////////////////////////////////
-INLINE void named_sync(const int barrier, const int num_threads);
-
-////////////////////////////////////////////////////////////////////////////////
 // Teams Reduction Scratchpad Helpers
 ////////////////////////////////////////////////////////////////////////////////
 INLINE unsigned int *GetTeamsReductionTimestamp();
diff --git a/openmp/libomptarget/deviceRTLs/nvptx/src/supporti.h b/openmp/libomptarget/deviceRTLs/nvptx/src/supporti.h
--- a/openmp/libomptarget/deviceRTLs/nvptx/src/supporti.h
+++ b/openmp/libomptarget/deviceRTLs/nvptx/src/supporti.h
@@ -269,17 +269,6 @@
 }
 
 ////////////////////////////////////////////////////////////////////////////////
-// Named Barrier Routines
-////////////////////////////////////////////////////////////////////////////////
-
-INLINE void named_sync(const int barrier, const int num_threads) {
-  asm volatile("bar.sync %0, %1;"
-               :
-               : "r"(barrier), "r"(num_threads)
-               : "memory");
-}
-
-////////////////////////////////////////////////////////////////////////////////
 // Teams Reduction Scratchpad Helpers
 ////////////////////////////////////////////////////////////////////////////////
 
diff --git a/openmp/libomptarget/deviceRTLs/nvptx/src/sync.cu b/openmp/libomptarget/deviceRTLs/nvptx/src/sync.cu
--- a/openmp/libomptarget/deviceRTLs/nvptx/src/sync.cu
+++ b/openmp/libomptarget/deviceRTLs/nvptx/src/sync.cu
@@ -61,7 +61,7 @@
             "call kmpc_barrier with %d omp threads, sync parameter %d\n",
             (int)numberOfActiveOMPThreads, (int)threads);
       // Barrier #1 is for synchronization among active threads.
-      named_sync(L1_BARRIER, threads);
+      __kmpc_impl_named_sync(L1_BARRIER, threads);
     }
   } else {
     // Still need to flush the memory per the standard.
@@ -92,7 +92,7 @@
         "%d\n",
         (int)numberOfActiveOMPThreads, (int)threads);
   // Barrier #1 is for synchronization among active threads.
-  named_sync(L1_BARRIER, threads);
+  __kmpc_impl_named_sync(L1_BARRIER, threads);
 
   PRINT0(LD_SYNC, "completed kmpc_barrier_simple_generic\n");
 }
diff --git a/openmp/libomptarget/deviceRTLs/nvptx/src/target_impl.h b/openmp/libomptarget/deviceRTLs/nvptx/src/target_impl.h
--- a/openmp/libomptarget/deviceRTLs/nvptx/src/target_impl.h
+++ b/openmp/libomptarget/deviceRTLs/nvptx/src/target_impl.h
@@ -153,4 +153,11 @@
 #endif // CUDA_VERSION
 }
 
+INLINE void __kmpc_impl_named_sync(int barrier, uint32_t num_threads) {
+  asm volatile("bar.sync %0, %1;"
+               :
+               : "r"(barrier), "r"(num_threads)
+               : "memory");
+}
+
 #endif