diff --git a/openmp/libomptarget/deviceRTLs/nvptx/src/data_sharing.cu b/openmp/libomptarget/deviceRTLs/nvptx/src/data_sharing.cu --- a/openmp/libomptarget/deviceRTLs/nvptx/src/data_sharing.cu +++ b/openmp/libomptarget/deviceRTLs/nvptx/src/data_sharing.cu @@ -20,7 +20,7 @@ // Return true if this is the first active thread in the warp. INLINE static bool IsWarpMasterActiveThread() { - unsigned long long Mask = __ACTIVEMASK(); + __kmpc_impl_lanemask_t Mask = __kmpc_impl_activemask(); unsigned long long ShNum = WARPSIZE - (GetThreadIdInBlock() % WARPSIZE); unsigned long long Sh = Mask << ShNum; // Truncate Sh to the 32 lower bits @@ -112,7 +112,7 @@ (unsigned long long)SharingDefaultDataSize); unsigned WID = getWarpId(); - unsigned CurActiveThreads = __ACTIVEMASK(); + __kmpc_impl_lanemask_t CurActiveThreads = __kmpc_impl_activemask(); __kmpc_data_sharing_slot *&SlotP = DataSharingState.SlotPtr[WID]; void *&StackP = DataSharingState.StackPtr[WID]; @@ -252,7 +252,7 @@ return; } - int32_t CurActive = __ACTIVEMASK(); + __kmpc_impl_lanemask_t CurActive = __kmpc_impl_activemask(); // Only the warp master can restore the stack and frame information, and only // if there are no other threads left behind in this environment (i.e. the @@ -378,7 +378,7 @@ // Frame pointer must be visible to all workers in the same warp. const unsigned WID = getWarpId(); void *FrameP = 0; - int32_t CurActive = __ACTIVEMASK(); + __kmpc_impl_lanemask_t CurActive = __kmpc_impl_activemask(); if (IsWarpMaster) { // SlotP will point to either the shared memory slot or an existing diff --git a/openmp/libomptarget/deviceRTLs/nvptx/src/loop.cu b/openmp/libomptarget/deviceRTLs/nvptx/src/loop.cu --- a/openmp/libomptarget/deviceRTLs/nvptx/src/loop.cu +++ b/openmp/libomptarget/deviceRTLs/nvptx/src/loop.cu @@ -389,7 +389,7 @@ } INLINE static uint64_t NextIter() { - __kmpc_impl_lanemask_t active = __ACTIVEMASK(); + __kmpc_impl_lanemask_t active = __kmpc_impl_activemask(); uint32_t leader = __kmpc_impl_ffs(active) - 1; uint32_t change = __kmpc_impl_popc(active); __kmpc_impl_lanemask_t lane_mask_lt = __kmpc_impl_lanemask_lt(); diff --git a/openmp/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.h b/openmp/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.h --- a/openmp/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.h +++ b/openmp/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.h @@ -45,17 +45,6 @@ #define BARRIER_COUNTER 0 #define ORDERED_COUNTER 1 -// Macros for Cuda intrinsics -// In Cuda 9.0, the *_sync() version takes an extra argument 'mask'. -// Also, __ballot(1) in Cuda 8.0 is replaced with __activemask(). -#ifndef CUDA_VERSION -#error CUDA_VERSION macro is undefined, something wrong with cuda. -#elif CUDA_VERSION >= 9000 -#define __ACTIVEMASK() __activemask() -#else -#define __ACTIVEMASK() __ballot(1) -#endif // CUDA_VERSION - // arguments needed for L0 parallelism only. class omptarget_nvptx_SharedArgs { public: diff --git a/openmp/libomptarget/deviceRTLs/nvptx/src/parallel.cu b/openmp/libomptarget/deviceRTLs/nvptx/src/parallel.cu --- a/openmp/libomptarget/deviceRTLs/nvptx/src/parallel.cu +++ b/openmp/libomptarget/deviceRTLs/nvptx/src/parallel.cu @@ -53,7 +53,7 @@ uint32_t WorkRemaining = ConvergentMask >> (*LaneSource + 1); *LaneSource += __kmpc_impl_ffs(WorkRemaining); *IsFinal = __kmpc_impl_popc(WorkRemaining) == 1; - uint32_t lanemask_lt = __kmpc_impl_lanemask_lt(); + __kmpc_impl_lanemask_t lanemask_lt = __kmpc_impl_lanemask_lt(); *LaneId = __kmpc_impl_popc(ConvergentMask & lanemask_lt); int threadId = GetLogicalThreadIdInBlock(isSPMDMode()); @@ -126,7 +126,7 @@ uint32_t WorkRemaining = ConvergentMask >> (*LaneSource + 1); *LaneSource += __kmpc_impl_ffs(WorkRemaining); *IsFinal = __kmpc_impl_popc(WorkRemaining) == 1; - uint32_t lanemask_lt = __kmpc_impl_lanemask_lt(); + __kmpc_impl_lanemask_t lanemask_lt = __kmpc_impl_lanemask_lt(); uint32_t OmpId = __kmpc_impl_popc(ConvergentMask & lanemask_lt); int threadId = GetLogicalThreadIdInBlock(isSPMDMode()); diff --git a/openmp/libomptarget/deviceRTLs/nvptx/src/reduction.cu b/openmp/libomptarget/deviceRTLs/nvptx/src/reduction.cu --- a/openmp/libomptarget/deviceRTLs/nvptx/src/reduction.cu +++ b/openmp/libomptarget/deviceRTLs/nvptx/src/reduction.cu @@ -61,12 +61,12 @@ gpu_irregular_simd_reduce(void *reduce_data, kmp_ShuffleReductFctPtr shflFct) { uint32_t size, remote_id, physical_lane_id; physical_lane_id = GetThreadIdInBlock() % WARPSIZE; - uint32_t lanemask_lt = __kmpc_impl_lanemask_lt(); - uint32_t Liveness = __ACTIVEMASK(); + __kmpc_impl_lanemask_t lanemask_lt = __kmpc_impl_lanemask_lt(); + __kmpc_impl_lanemask_t Liveness = __kmpc_impl_activemask(); uint32_t logical_lane_id = __kmpc_impl_popc(Liveness & lanemask_lt) * 2; - uint32_t lanemask_gt = __kmpc_impl_lanemask_gt(); + __kmpc_impl_lanemask_t lanemask_gt = __kmpc_impl_lanemask_gt(); do { - Liveness = __ACTIVEMASK(); + Liveness = __kmpc_impl_activemask(); remote_id = __kmpc_impl_ffs(Liveness & lanemask_gt); size = __kmpc_impl_popc(Liveness); logical_lane_id /= 2; @@ -81,7 +81,7 @@ size_t reduce_size, void *reduce_data, kmp_ShuffleReductFctPtr shflFct, kmp_InterWarpCopyFctPtr cpyFct) { - uint32_t Liveness = __ACTIVEMASK(); + __kmpc_impl_lanemask_t Liveness = __kmpc_impl_activemask(); if (Liveness == 0xffffffff) { gpu_regular_warp_reduce(reduce_data, shflFct); return GetThreadIdInBlock() % WARPSIZE == @@ -142,7 +142,7 @@ } return BlockThreadId == 0; #else - uint32_t Liveness = __ACTIVEMASK(); + __kmpc_impl_lanemask_t Liveness = __kmpc_impl_activemask(); if (Liveness == 0xffffffff) // Full warp gpu_regular_warp_reduce(reduce_data, shflFct); else if (!(Liveness & (Liveness + 1))) // Partial warp but contiguous lanes @@ -317,7 +317,7 @@ ldFct(reduce_data, scratchpad, i, NumTeams, /*Load and reduce*/ 1); // Reduce across warps to the warp master. - uint32_t Liveness = __ACTIVEMASK(); + __kmpc_impl_lanemask_t Liveness = __kmpc_impl_activemask(); if (Liveness == 0xffffffff) // Full warp gpu_regular_warp_reduce(reduce_data, shflFct); else // Partial warp but contiguous lanes diff --git a/openmp/libomptarget/deviceRTLs/nvptx/src/supporti.h b/openmp/libomptarget/deviceRTLs/nvptx/src/supporti.h --- a/openmp/libomptarget/deviceRTLs/nvptx/src/supporti.h +++ b/openmp/libomptarget/deviceRTLs/nvptx/src/supporti.h @@ -204,9 +204,9 @@ // Parallel level INLINE void IncParallelLevel(bool ActiveParallel) { - unsigned Active = __ACTIVEMASK(); + __kmpc_impl_lanemask_t Active = __kmpc_impl_activemask(); __kmpc_impl_syncwarp(Active); - unsigned LaneMaskLt = __kmpc_impl_lanemask_lt(); + __kmpc_impl_lanemask_t LaneMaskLt = __kmpc_impl_lanemask_lt(); unsigned Rank = __kmpc_impl_popc(Active & LaneMaskLt); if (Rank == 0) { parallelLevel[GetWarpId()] += @@ -217,9 +217,9 @@ } INLINE void DecParallelLevel(bool ActiveParallel) { - unsigned Active = __ACTIVEMASK(); + __kmpc_impl_lanemask_t Active = __kmpc_impl_activemask(); __kmpc_impl_syncwarp(Active); - unsigned LaneMaskLt = __kmpc_impl_lanemask_lt(); + __kmpc_impl_lanemask_t LaneMaskLt = __kmpc_impl_lanemask_lt(); unsigned Rank = __kmpc_impl_popc(Active & LaneMaskLt); if (Rank == 0) { parallelLevel[GetWarpId()] -= diff --git a/openmp/libomptarget/deviceRTLs/nvptx/src/sync.cu b/openmp/libomptarget/deviceRTLs/nvptx/src/sync.cu --- a/openmp/libomptarget/deviceRTLs/nvptx/src/sync.cu +++ b/openmp/libomptarget/deviceRTLs/nvptx/src/sync.cu @@ -142,7 +142,7 @@ EXTERN int32_t __kmpc_warp_active_thread_mask() { PRINT0(LD_IO, "call __kmpc_warp_active_thread_mask\n"); - return __ACTIVEMASK(); + return __kmpc_impl_activemask(); } //////////////////////////////////////////////////////////////////////////////// diff --git a/openmp/libomptarget/deviceRTLs/nvptx/src/target_impl.h b/openmp/libomptarget/deviceRTLs/nvptx/src/target_impl.h --- a/openmp/libomptarget/deviceRTLs/nvptx/src/target_impl.h +++ b/openmp/libomptarget/deviceRTLs/nvptx/src/target_impl.h @@ -48,6 +48,16 @@ #error CUDA_VERSION macro is undefined, something wrong with cuda. #endif +// In Cuda 9.0, __ballot(1) from Cuda 8.0 is replaced with __activemask(). + +INLINE __kmpc_impl_lanemask_t __kmpc_impl_activemask() { +#if CUDA_VERSION >= 9000 + return __activemask(); +#else + return __ballot(1); +#endif +} + // In Cuda 9.0, the *_sync() version takes an extra argument 'mask'. INLINE int32_t __kmpc_impl_shfl_sync(__kmpc_impl_lanemask_t Mask, int32_t Var,