diff --git a/openmp/libomptarget/deviceRTLs/nvptx/src/data_sharing.cu b/openmp/libomptarget/deviceRTLs/nvptx/src/data_sharing.cu --- a/openmp/libomptarget/deviceRTLs/nvptx/src/data_sharing.cu +++ b/openmp/libomptarget/deviceRTLs/nvptx/src/data_sharing.cu @@ -10,6 +10,8 @@ // //===----------------------------------------------------------------------===// #include "omptarget-nvptx.h" +#include "target_impl.h" + #include // Warp ID in the CUDA block @@ -552,7 +554,7 @@ *frame = omptarget_nvptx_simpleMemoryManager.Acquire(buf, size); } // FIXME: use __syncthreads instead when the function copy is fixed in LLVM. - __SYNCTHREADS(); + __kmpc_impl_syncthreads(); return; } ASSERT0(LT_FUSSY, GetThreadIdInBlock() == GetMasterThreadID(), @@ -567,7 +569,7 @@ return; if (isSPMDExecutionMode) { // FIXME: use __syncthreads instead when the function copy is fixed in LLVM. - __SYNCTHREADS(); + __kmpc_impl_syncthreads(); if (GetThreadIdInBlock() == 0) { omptarget_nvptx_simpleMemoryManager.Release(); } diff --git a/openmp/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.h b/openmp/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.h --- a/openmp/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.h +++ b/openmp/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.h @@ -65,14 +65,6 @@ #define __SYNCWARP(Mask) #endif // CUDA_VERSION -#define __SYNCTHREADS_N(n) asm volatile("bar.sync %0;" : : "r"(n) : "memory"); -// Use original __syncthreads if compiled by nvcc or clang >= 9.0. -#if !defined(__clang__) || __clang_major__ >= 9 -#define __SYNCTHREADS() __syncthreads() -#else -#define __SYNCTHREADS() __SYNCTHREADS_N(0) -#endif - // arguments needed for L0 parallelism only. class omptarget_nvptx_SharedArgs { public: diff --git a/openmp/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.cu b/openmp/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.cu --- a/openmp/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.cu +++ b/openmp/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.cu @@ -11,6 +11,7 @@ //===----------------------------------------------------------------------===// #include "omptarget-nvptx.h" +#include "target_impl.h" //////////////////////////////////////////////////////////////////////////////// // global data tables @@ -106,7 +107,7 @@ } if (!RequiresOMPRuntime) { // Runtime is not required - exit. - __SYNCTHREADS(); + __kmpc_impl_syncthreads(); return; } @@ -126,7 +127,7 @@ currTeamDescr.InitTeamDescr(); } // FIXME: use __syncthreads instead when the function copy is fixed in LLVM. - __SYNCTHREADS(); + __kmpc_impl_syncthreads(); omptarget_nvptx_TeamDescr &currTeamDescr = getMyTeamDescriptor(); omptarget_nvptx_WorkDescr &workDescr = getMyWorkDescriptor(); @@ -169,7 +170,7 @@ return; // FIXME: use __syncthreads instead when the function copy is fixed in LLVM. - __SYNCTHREADS(); + __kmpc_impl_syncthreads(); int threadId = GetThreadIdInBlock(); if (threadId == 0) { // Enqueue omp state object for use by another team. diff --git a/openmp/libomptarget/deviceRTLs/nvptx/src/sync.cu b/openmp/libomptarget/deviceRTLs/nvptx/src/sync.cu --- a/openmp/libomptarget/deviceRTLs/nvptx/src/sync.cu +++ b/openmp/libomptarget/deviceRTLs/nvptx/src/sync.cu @@ -76,7 +76,7 @@ EXTERN void __kmpc_barrier_simple_spmd(kmp_Ident *loc_ref, int32_t tid) { PRINT0(LD_SYNC, "call kmpc_barrier_simple_spmd\n"); // FIXME: use __syncthreads instead when the function copy is fixed in LLVM. - __SYNCTHREADS(); + __kmpc_impl_syncthreads(); PRINT0(LD_SYNC, "completed kmpc_barrier_simple_spmd\n"); } diff --git a/openmp/libomptarget/deviceRTLs/nvptx/src/target_impl.h b/openmp/libomptarget/deviceRTLs/nvptx/src/target_impl.h --- a/openmp/libomptarget/deviceRTLs/nvptx/src/target_impl.h +++ b/openmp/libomptarget/deviceRTLs/nvptx/src/target_impl.h @@ -38,6 +38,19 @@ INLINE int __kmpc_impl_popc(uint32_t x) { return __popc(x); } +#ifndef CUDA_VERSION +#error CUDA_VERSION macro is undefined, something wrong with cuda. +#endif + +INLINE void __kmpc_impl_syncthreads() { + // Use original __syncthreads if compiled by nvcc or clang >= 9.0. +#if !defined(__clang__) || __clang_major__ >= 9 + __syncthreads(); +#else + asm volatile("bar.sync %0;" : : "r"(0) : "memory"); +#endif +} + INLINE void __kmpc_impl_syncwarp(int32_t Mask) { __SYNCWARP(Mask); } #endif