diff --git a/openmp/libomptarget/deviceRTLs/nvptx/src/data_sharing.cu b/openmp/libomptarget/deviceRTLs/nvptx/src/data_sharing.cu --- a/openmp/libomptarget/deviceRTLs/nvptx/src/data_sharing.cu +++ b/openmp/libomptarget/deviceRTLs/nvptx/src/data_sharing.cu @@ -10,6 +10,7 @@ // //===----------------------------------------------------------------------===// #include "omptarget-nvptx.h" +#include "target_impl.h" #include // Warp ID in the CUDA block @@ -19,7 +20,7 @@ // Return true if this is the first active thread in the warp. INLINE static bool IsWarpMasterActiveThread() { - unsigned long long Mask = __ACTIVEMASK(); + unsigned long long Mask = __kmpc_impl_activemask(); unsigned long long ShNum = WARPSIZE - (GetThreadIdInBlock() % WARPSIZE); unsigned long long Sh = Mask << ShNum; // Truncate Sh to the 32 lower bits @@ -111,7 +112,7 @@ (unsigned long long)SharingDefaultDataSize); unsigned WID = getWarpId(); - unsigned CurActiveThreads = __ACTIVEMASK(); + unsigned CurActiveThreads = __kmpc_impl_activemask(); __kmpc_data_sharing_slot *&SlotP = DataSharingState.SlotPtr[WID]; void *&StackP = DataSharingState.StackPtr[WID]; @@ -251,7 +252,7 @@ return; } - int32_t CurActive = __ACTIVEMASK(); + int32_t CurActive = __kmpc_impl_activemask(); // Only the warp master can restore the stack and frame information, and only // if there are no other threads left behind in this environment (i.e. the @@ -377,7 +378,7 @@ // Frame pointer must be visible to all workers in the same warp. const unsigned WID = getWarpId(); void *FrameP = 0; - int32_t CurActive = __ACTIVEMASK(); + int32_t CurActive = __kmpc_impl_activemask(); if (IsWarpMaster) { // SlotP will point to either the shared memory slot or an existing diff --git a/openmp/libomptarget/deviceRTLs/nvptx/src/loop.cu b/openmp/libomptarget/deviceRTLs/nvptx/src/loop.cu --- a/openmp/libomptarget/deviceRTLs/nvptx/src/loop.cu +++ b/openmp/libomptarget/deviceRTLs/nvptx/src/loop.cu @@ -389,7 +389,7 @@ } INLINE static uint64_t NextIter() { - __kmpc_impl_lanemask_t active = __ACTIVEMASK(); + __kmpc_impl_lanemask_t active = __kmpc_impl_activemask(); int leader = __kmpc_impl_ffs(active) - 1; int change = __kmpc_impl_popc(active); __kmpc_impl_lanemask_t lane_mask_lt = __kmpc_impl_lanemask_lt(); diff --git a/openmp/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.h b/openmp/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.h --- a/openmp/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.h +++ b/openmp/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.h @@ -47,20 +47,17 @@ // Macros for Cuda intrinsics // In Cuda 9.0, the *_sync() version takes an extra argument 'mask'. -// Also, __ballot(1) in Cuda 8.0 is replaced with __activemask(). #ifndef CUDA_VERSION #error CUDA_VERSION macro is undefined, something wrong with cuda. #elif CUDA_VERSION >= 9000 #define __SHFL_SYNC(mask, var, srcLane) __shfl_sync((mask), (var), (srcLane)) #define __SHFL_DOWN_SYNC(mask, var, delta, width) \ __shfl_down_sync((mask), (var), (delta), (width)) -#define __ACTIVEMASK() __activemask() #define __SYNCWARP(Mask) __syncwarp(Mask) #else #define __SHFL_SYNC(mask, var, srcLane) __shfl((var), (srcLane)) #define __SHFL_DOWN_SYNC(mask, var, delta, width) \ __shfl_down((var), (delta), (width)) -#define __ACTIVEMASK() __ballot(1) // In Cuda < 9.0 no need to sync threads in warps. #define __SYNCWARP(Mask) #endif // CUDA_VERSION diff --git a/openmp/libomptarget/deviceRTLs/nvptx/src/reduction.cu b/openmp/libomptarget/deviceRTLs/nvptx/src/reduction.cu --- a/openmp/libomptarget/deviceRTLs/nvptx/src/reduction.cu +++ b/openmp/libomptarget/deviceRTLs/nvptx/src/reduction.cu @@ -15,6 +15,7 @@ #include #include "omptarget-nvptx.h" +#include "target_impl.h" EXTERN void __kmpc_nvptx_end_reduce(int32_t global_tid) {} @@ -64,11 +65,11 @@ uint32_t size, remote_id, physical_lane_id; physical_lane_id = GetThreadIdInBlock() % WARPSIZE; asm("mov.u32 %0, %%lanemask_lt;" : "=r"(lanemask_lt)); - uint32_t Liveness = __ACTIVEMASK(); + uint32_t Liveness = __kmpc_impl_activemask(); uint32_t logical_lane_id = __popc(Liveness & lanemask_lt) * 2; asm("mov.u32 %0, %%lanemask_gt;" : "=r"(lanemask_gt)); do { - Liveness = __ACTIVEMASK(); + Liveness = __kmpc_impl_activemask(); remote_id = __ffs(Liveness & lanemask_gt); size = __popc(Liveness); logical_lane_id /= 2; @@ -83,7 +84,7 @@ size_t reduce_size, void *reduce_data, kmp_ShuffleReductFctPtr shflFct, kmp_InterWarpCopyFctPtr cpyFct) { - uint32_t Liveness = __ACTIVEMASK(); + uint32_t Liveness = __kmpc_impl_activemask(); if (Liveness == 0xffffffff) { gpu_regular_warp_reduce(reduce_data, shflFct); return GetThreadIdInBlock() % WARPSIZE == @@ -144,7 +145,7 @@ } return BlockThreadId == 0; #else - uint32_t Liveness = __ACTIVEMASK(); + uint32_t Liveness = __kmpc_impl_activemask(); if (Liveness == 0xffffffff) // Full warp gpu_regular_warp_reduce(reduce_data, shflFct); else if (!(Liveness & (Liveness + 1))) // Partial warp but contiguous lanes @@ -319,7 +320,7 @@ ldFct(reduce_data, scratchpad, i, NumTeams, /*Load and reduce*/ 1); // Reduce across warps to the warp master. - uint32_t Liveness = __ACTIVEMASK(); + uint32_t Liveness = __kmpc_impl_activemask(); if (Liveness == 0xffffffff) // Full warp gpu_regular_warp_reduce(reduce_data, shflFct); else // Partial warp but contiguous lanes diff --git a/openmp/libomptarget/deviceRTLs/nvptx/src/supporti.h b/openmp/libomptarget/deviceRTLs/nvptx/src/supporti.h --- a/openmp/libomptarget/deviceRTLs/nvptx/src/supporti.h +++ b/openmp/libomptarget/deviceRTLs/nvptx/src/supporti.h @@ -14,6 +14,8 @@ // Execution Parameters //////////////////////////////////////////////////////////////////////////////// +#include "target_impl.h" + INLINE void setExecutionParameters(ExecutionMode EMode, RuntimeMode RMode) { execution_param = EMode; execution_param |= RMode; @@ -202,7 +204,7 @@ // Parallel level INLINE void IncParallelLevel(bool ActiveParallel) { - unsigned Active = __ACTIVEMASK(); + unsigned Active = __kmpc_impl_activemask(); __SYNCWARP(Active); unsigned LaneMaskLt; asm("mov.u32 %0, %%lanemask_lt;" : "=r"(LaneMaskLt)); @@ -216,7 +218,7 @@ } INLINE void DecParallelLevel(bool ActiveParallel) { - unsigned Active = __ACTIVEMASK(); + unsigned Active = __kmpc_impl_activemask(); __SYNCWARP(Active); unsigned LaneMaskLt; asm("mov.u32 %0, %%lanemask_lt;" : "=r"(LaneMaskLt)); diff --git a/openmp/libomptarget/deviceRTLs/nvptx/src/sync.cu b/openmp/libomptarget/deviceRTLs/nvptx/src/sync.cu --- a/openmp/libomptarget/deviceRTLs/nvptx/src/sync.cu +++ b/openmp/libomptarget/deviceRTLs/nvptx/src/sync.cu @@ -143,7 +143,7 @@ EXTERN int32_t __kmpc_warp_active_thread_mask() { PRINT0(LD_IO, "call __kmpc_warp_active_thread_mask\n"); - return __ACTIVEMASK(); + return __kmpc_impl_activemask(); } //////////////////////////////////////////////////////////////////////////////// diff --git a/openmp/libomptarget/deviceRTLs/nvptx/src/target_impl.h b/openmp/libomptarget/deviceRTLs/nvptx/src/target_impl.h --- a/openmp/libomptarget/deviceRTLs/nvptx/src/target_impl.h +++ b/openmp/libomptarget/deviceRTLs/nvptx/src/target_impl.h @@ -38,6 +38,19 @@ INLINE int __kmpc_impl_popc(uint32_t x) { return __popc(x); } +#ifndef CUDA_VERSION +#error CUDA_VERSION macro is undefined, something wrong with cuda. +#endif + +// __ballot(1) in Cuda 8.0 is replaced with __activemask(). +INLINE __kmpc_impl_lanemask_t __kmpc_impl_activemask() { +#if CUDA_VERSION >= 9000 + return __activemask(); +#else + return __ballot(1); +#endif +} + INLINE void __kmpc_impl_syncwarp(int32_t Mask) { __SYNCWARP(Mask); } #endif