diff --git a/openmp/libomptarget/deviceRTLs/nvptx/src/data_sharing.cu b/openmp/libomptarget/deviceRTLs/nvptx/src/data_sharing.cu --- a/openmp/libomptarget/deviceRTLs/nvptx/src/data_sharing.cu +++ b/openmp/libomptarget/deviceRTLs/nvptx/src/data_sharing.cu @@ -10,6 +10,7 @@ // //===----------------------------------------------------------------------===// #include "omptarget-nvptx.h" +#include "target_impl.h" #include // Warp ID in the CUDA block @@ -19,7 +20,7 @@ // Return true if this is the first active thread in the warp. INLINE static bool IsWarpMasterActiveThread() { - unsigned long long Mask = __ACTIVEMASK(); + unsigned long long Mask = __kmpc_impl_activemask(); unsigned long long ShNum = WARPSIZE - (GetThreadIdInBlock() % WARPSIZE); unsigned long long Sh = Mask << ShNum; // Truncate Sh to the 32 lower bits @@ -111,7 +112,7 @@ (unsigned long long)SharingDefaultDataSize); unsigned WID = getWarpId(); - unsigned CurActiveThreads = __ACTIVEMASK(); + unsigned CurActiveThreads = __kmpc_impl_activemask(); __kmpc_data_sharing_slot *&SlotP = DataSharingState.SlotPtr[WID]; void *&StackP = DataSharingState.StackPtr[WID]; @@ -251,7 +252,7 @@ return; } - int32_t CurActive = __ACTIVEMASK(); + int32_t CurActive = __kmpc_impl_activemask(); // Only the warp master can restore the stack and frame information, and only // if there are no other threads left behind in this environment (i.e. the @@ -377,7 +378,7 @@ // Frame pointer must be visible to all workers in the same warp. const unsigned WID = getWarpId(); void *FrameP = 0; - int32_t CurActive = __ACTIVEMASK(); + int32_t CurActive = __kmpc_impl_activemask(); if (IsWarpMaster) { // SlotP will point to either the shared memory slot or an existing @@ -430,9 +431,10 @@ } } // Get address from lane 0. - ((int *)&FrameP)[0] = __SHFL_SYNC(CurActive, ((int *)&FrameP)[0], 0); + int *FP = (int *)&FrameP; + FP[0] = __kmpc_impl_shfl_sync(CurActive, FP[0], 0); if (sizeof(FrameP) == 8) - ((int *)&FrameP)[1] = __SHFL_SYNC(CurActive, ((int *)&FrameP)[1], 0); + FP[1] = __kmpc_impl_shfl_sync(CurActive, FP[1], 0); return FrameP; } diff --git a/openmp/libomptarget/deviceRTLs/nvptx/src/loop.cu b/openmp/libomptarget/deviceRTLs/nvptx/src/loop.cu --- a/openmp/libomptarget/deviceRTLs/nvptx/src/loop.cu +++ b/openmp/libomptarget/deviceRTLs/nvptx/src/loop.cu @@ -383,13 +383,13 @@ INLINE static int64_t Shuffle(unsigned active, int64_t val, int leader) { int lo, hi; __kmpc_impl_unpack(val, lo, hi); - hi = __SHFL_SYNC(active, hi, leader); - lo = __SHFL_SYNC(active, lo, leader); + hi = __kmpc_impl_shfl_sync(active, hi, leader); + lo = __kmpc_impl_shfl_sync(active, lo, leader); return __kmpc_impl_pack(lo, hi); } INLINE static uint64_t NextIter() { - __kmpc_impl_lanemask_t active = __ACTIVEMASK(); + __kmpc_impl_lanemask_t active = __kmpc_impl_activemask(); int leader = __kmpc_impl_ffs(active) - 1; int change = __kmpc_impl_popc(active); __kmpc_impl_lanemask_t lane_mask_lt = __kmpc_impl_lanemask_lt(); diff --git a/openmp/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.h b/openmp/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.h --- a/openmp/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.h +++ b/openmp/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.h @@ -45,34 +45,6 @@ #define BARRIER_COUNTER 0 #define ORDERED_COUNTER 1 -// Macros for Cuda intrinsics -// In Cuda 9.0, the *_sync() version takes an extra argument 'mask'. -// Also, __ballot(1) in Cuda 8.0 is replaced with __activemask(). -#ifndef CUDA_VERSION -#error CUDA_VERSION macro is undefined, something wrong with cuda. -#elif CUDA_VERSION >= 9000 -#define __SHFL_SYNC(mask, var, srcLane) __shfl_sync((mask), (var), (srcLane)) -#define __SHFL_DOWN_SYNC(mask, var, delta, width) \ - __shfl_down_sync((mask), (var), (delta), (width)) -#define __ACTIVEMASK() __activemask() -#define __SYNCWARP(Mask) __syncwarp(Mask) -#else -#define __SHFL_SYNC(mask, var, srcLane) __shfl((var), (srcLane)) -#define __SHFL_DOWN_SYNC(mask, var, delta, width) \ - __shfl_down((var), (delta), (width)) -#define __ACTIVEMASK() __ballot(1) -// In Cuda < 9.0 no need to sync threads in warps. -#define __SYNCWARP(Mask) -#endif // CUDA_VERSION - -#define __SYNCTHREADS_N(n) asm volatile("bar.sync %0;" : : "r"(n) : "memory"); -// Use original __syncthreads if compiled by nvcc or clang >= 9.0. -#if !defined(__clang__) || __clang_major__ >= 9 -#define __SYNCTHREADS() __syncthreads() -#else -#define __SYNCTHREADS() __SYNCTHREADS_N(0) -#endif - // arguments needed for L0 parallelism only. class omptarget_nvptx_SharedArgs { public: diff --git a/openmp/libomptarget/deviceRTLs/nvptx/src/parallel.cu b/openmp/libomptarget/deviceRTLs/nvptx/src/parallel.cu --- a/openmp/libomptarget/deviceRTLs/nvptx/src/parallel.cu +++ b/openmp/libomptarget/deviceRTLs/nvptx/src/parallel.cu @@ -33,6 +33,7 @@ //===----------------------------------------------------------------------===// #include "omptarget-nvptx.h" +#include "target_impl.h" typedef struct ConvergentSimdJob { omptarget_nvptx_TaskDescr taskDescr; @@ -64,7 +65,7 @@ omptarget_nvptx_threadPrivateContext->SimdLimitForNextSimd(threadId); job->slimForNextSimd = SimdLimit; - int32_t SimdLimitSource = __SHFL_SYNC(Mask, SimdLimit, *LaneSource); + int32_t SimdLimitSource = __kmpc_impl_shfl_sync(Mask, SimdLimit, *LaneSource); // reset simdlimit to avoid propagating to successive #simd if (SimdLimitSource > 0 && threadId == sourceThreadId) omptarget_nvptx_threadPrivateContext->SimdLimitForNextSimd(threadId) = 0; @@ -138,7 +139,8 @@ omptarget_nvptx_threadPrivateContext->NumThreadsForNextParallel(threadId); job->tnumForNextPar = NumThreadsClause; - int32_t NumThreadsSource = __SHFL_SYNC(Mask, NumThreadsClause, *LaneSource); + int32_t NumThreadsSource = + __kmpc_impl_shfl_sync(Mask, NumThreadsClause, *LaneSource); // reset numthreads to avoid propagating to successive #parallel if (NumThreadsSource > 0 && threadId == sourceThreadId) omptarget_nvptx_threadPrivateContext->NumThreadsForNextParallel(threadId) = diff --git a/openmp/libomptarget/deviceRTLs/nvptx/src/reduction.cu b/openmp/libomptarget/deviceRTLs/nvptx/src/reduction.cu --- a/openmp/libomptarget/deviceRTLs/nvptx/src/reduction.cu +++ b/openmp/libomptarget/deviceRTLs/nvptx/src/reduction.cu @@ -15,6 +15,7 @@ #include #include "omptarget-nvptx.h" +#include "target_impl.h" EXTERN void __kmpc_nvptx_end_reduce(int32_t global_tid) {} @@ -23,14 +24,14 @@ void __kmpc_nvptx_end_reduce_nowait(int32_t global_tid) {} EXTERN int32_t __kmpc_shuffle_int32(int32_t val, int16_t delta, int16_t size) { - return __SHFL_DOWN_SYNC(0xFFFFFFFF, val, delta, size); + return __kmpc_impl_shfl_down_sync(0xFFFFFFFF, val, delta, size); } EXTERN int64_t __kmpc_shuffle_int64(int64_t val, int16_t delta, int16_t size) { int lo, hi; asm volatile("mov.b64 {%0,%1}, %2;" : "=r"(lo), "=r"(hi) : "l"(val)); - hi = __SHFL_DOWN_SYNC(0xFFFFFFFF, hi, delta, size); - lo = __SHFL_DOWN_SYNC(0xFFFFFFFF, lo, delta, size); + hi = __kmpc_impl_shfl_down_sync(0xFFFFFFFF, hi, delta, size); + lo = __kmpc_impl_shfl_down_sync(0xFFFFFFFF, lo, delta, size); asm volatile("mov.b64 %0, {%1,%2};" : "=l"(val) : "r"(lo), "r"(hi)); return val; } @@ -64,11 +65,11 @@ uint32_t size, remote_id, physical_lane_id; physical_lane_id = GetThreadIdInBlock() % WARPSIZE; asm("mov.u32 %0, %%lanemask_lt;" : "=r"(lanemask_lt)); - uint32_t Liveness = __ACTIVEMASK(); + uint32_t Liveness = __kmpc_impl_activemask(); uint32_t logical_lane_id = __popc(Liveness & lanemask_lt) * 2; asm("mov.u32 %0, %%lanemask_gt;" : "=r"(lanemask_gt)); do { - Liveness = __ACTIVEMASK(); + Liveness = __kmpc_impl_activemask(); remote_id = __ffs(Liveness & lanemask_gt); size = __popc(Liveness); logical_lane_id /= 2; @@ -83,7 +84,7 @@ size_t reduce_size, void *reduce_data, kmp_ShuffleReductFctPtr shflFct, kmp_InterWarpCopyFctPtr cpyFct) { - uint32_t Liveness = __ACTIVEMASK(); + uint32_t Liveness = __kmpc_impl_activemask(); if (Liveness == 0xffffffff) { gpu_regular_warp_reduce(reduce_data, shflFct); return GetThreadIdInBlock() % WARPSIZE == @@ -144,7 +145,7 @@ } return BlockThreadId == 0; #else - uint32_t Liveness = __ACTIVEMASK(); + uint32_t Liveness = __kmpc_impl_activemask(); if (Liveness == 0xffffffff) // Full warp gpu_regular_warp_reduce(reduce_data, shflFct); else if (!(Liveness & (Liveness + 1))) // Partial warp but contiguous lanes @@ -319,7 +320,7 @@ ldFct(reduce_data, scratchpad, i, NumTeams, /*Load and reduce*/ 1); // Reduce across warps to the warp master. - uint32_t Liveness = __ACTIVEMASK(); + uint32_t Liveness = __kmpc_impl_activemask(); if (Liveness == 0xffffffff) // Full warp gpu_regular_warp_reduce(reduce_data, shflFct); else // Partial warp but contiguous lanes diff --git a/openmp/libomptarget/deviceRTLs/nvptx/src/supporti.h b/openmp/libomptarget/deviceRTLs/nvptx/src/supporti.h --- a/openmp/libomptarget/deviceRTLs/nvptx/src/supporti.h +++ b/openmp/libomptarget/deviceRTLs/nvptx/src/supporti.h @@ -14,6 +14,8 @@ // Execution Parameters //////////////////////////////////////////////////////////////////////////////// +#include "target_impl.h" + INLINE void setExecutionParameters(ExecutionMode EMode, RuntimeMode RMode) { execution_param = EMode; execution_param |= RMode; @@ -202,8 +204,8 @@ // Parallel level INLINE void IncParallelLevel(bool ActiveParallel) { - unsigned Active = __ACTIVEMASK(); - __SYNCWARP(Active); + unsigned Active = __kmpc_impl_activemask(); + __kmpc_impl_syncwarp(Active); unsigned LaneMaskLt; asm("mov.u32 %0, %%lanemask_lt;" : "=r"(LaneMaskLt)); unsigned Rank = __popc(Active & LaneMaskLt); @@ -212,12 +214,12 @@ (1 + (ActiveParallel ? OMP_ACTIVE_PARALLEL_LEVEL : 0)); __threadfence(); } - __SYNCWARP(Active); + __kmpc_impl_syncwarp(Active); } INLINE void DecParallelLevel(bool ActiveParallel) { - unsigned Active = __ACTIVEMASK(); - __SYNCWARP(Active); + unsigned Active = __kmpc_impl_activemask(); + __kmpc_impl_syncwarp(Active); unsigned LaneMaskLt; asm("mov.u32 %0, %%lanemask_lt;" : "=r"(LaneMaskLt)); unsigned Rank = __popc(Active & LaneMaskLt); @@ -226,7 +228,7 @@ (1 + (ActiveParallel ? OMP_ACTIVE_PARALLEL_LEVEL : 0)); __threadfence(); } - __SYNCWARP(Active); + __kmpc_impl_syncwarp(Active); } //////////////////////////////////////////////////////////////////////////////// diff --git a/openmp/libomptarget/deviceRTLs/nvptx/src/sync.cu b/openmp/libomptarget/deviceRTLs/nvptx/src/sync.cu --- a/openmp/libomptarget/deviceRTLs/nvptx/src/sync.cu +++ b/openmp/libomptarget/deviceRTLs/nvptx/src/sync.cu @@ -143,7 +143,7 @@ EXTERN int32_t __kmpc_warp_active_thread_mask() { PRINT0(LD_IO, "call __kmpc_warp_active_thread_mask\n"); - return __ACTIVEMASK(); + return __kmpc_impl_activemask(); } //////////////////////////////////////////////////////////////////////////////// diff --git a/openmp/libomptarget/deviceRTLs/nvptx/src/target_impl.h b/openmp/libomptarget/deviceRTLs/nvptx/src/target_impl.h --- a/openmp/libomptarget/deviceRTLs/nvptx/src/target_impl.h +++ b/openmp/libomptarget/deviceRTLs/nvptx/src/target_impl.h @@ -38,6 +38,55 @@ INLINE int __kmpc_impl_popc(uint32_t x) { return __popc(x); } -INLINE void __kmpc_impl_syncwarp(int32_t Mask) { __SYNCWARP(Mask); } +#ifndef CUDA_VERSION +#error CUDA_VERSION macro is undefined, something wrong with cuda. +#endif + +// In Cuda 9.0, the *_sync() version takes an extra argument 'mask'. +// Also, __ballot(1) in Cuda 8.0 is replaced with __activemask(). + +INLINE int32_t __kmpc_impl_shfl_sync(__kmpc_impl_lanemask_t Mask, int32_t Var, + int32_t SrcLane) { +#if CUDA_VERSION >= 9000 + return __shfl_sync(Mask, Var, SrcLane); +#else + return __shfl(Var, SrcLane); +#endif +} + +INLINE int32_t __kmpc_impl_shfl_down_sync(__kmpc_impl_lanemask_t Mask, + int32_t Var, uint32_t Delta, + int32_t Width) { +#if CUDA_VERSION >= 9000 + return __shfl_down_sync(Mask, Var, Delta, Width); +#else + return __shfl_down(Var, Delta, Width); +#endif +} + +INLINE __kmpc_impl_lanemask_t __kmpc_impl_activemask() { +#if CUDA_VERSION >= 9000 + return __activemask(); +#else + return __ballot(1); +#endif +} + +INLINE void __kmpc_impl_syncthreads() { + // Use original __syncthreads if compiled by nvcc or clang >= 9.0. +#if !defined(__clang__) || __clang_major__ >= 9 + __syncthreads(); +#else + asm volatile("bar.sync %0;" : : "r"(0) : "memory"); +#endif +} + +INLINE void __kmpc_impl_syncwarp(__kmpc_impl_lanemask_t Mask) { +#if CUDA_VERSION >= 9000 + __syncwarp(Mask); +#else + // In Cuda < 9.0 no need to sync threads in warps. +#endif +} #endif