Index: openmp/trunk/libomptarget/deviceRTLs/nvptx/src/data_sharing.cu
===================================================================
--- openmp/trunk/libomptarget/deviceRTLs/nvptx/src/data_sharing.cu
+++ openmp/trunk/libomptarget/deviceRTLs/nvptx/src/data_sharing.cu
@@ -13,42 +13,26 @@
 #include "omptarget-nvptx.h"
 #include <stdio.h>
 
-// Number of threads in the CUDA block.
-__device__ static unsigned getNumThreads() { return blockDim.x; }
-// Thread ID in the CUDA block
-__device__ static unsigned getThreadId() { return threadIdx.x; }
 // Warp ID in the CUDA block
-__device__ static unsigned getWarpId() { return threadIdx.x / WARPSIZE; }
+INLINE static unsigned getWarpId() { return threadIdx.x / WARPSIZE; }
 // Lane ID in the CUDA warp.
-__device__ static unsigned getLaneId() { return threadIdx.x % WARPSIZE; }
-
-// The CUDA thread ID of the master thread.
-__device__ static unsigned getMasterThreadId() {
-  unsigned Mask = WARPSIZE - 1;
-  return (getNumThreads() - 1) & (~Mask);
-}
-
-// Find the active threads in the warp - return a mask whose n-th bit is set if
-// the n-th thread in the warp is active.
-__device__ static unsigned getActiveThreadsMask() {
-  return __BALLOT_SYNC(0xFFFFFFFF, true);
-}
+INLINE static unsigned getLaneId() { return threadIdx.x % WARPSIZE; }
 
 // Return true if this is the first active thread in the warp.
-__device__ static bool IsWarpMasterActiveThread() {
-  unsigned long long Mask = getActiveThreadsMask();
-  unsigned long long ShNum = WARPSIZE - (getThreadId() % WARPSIZE);
+INLINE static bool IsWarpMasterActiveThread() {
+  unsigned long long Mask = __ACTIVEMASK();
+  unsigned long long ShNum = WARPSIZE - (GetThreadIdInBlock() % WARPSIZE);
   unsigned long long Sh = Mask << ShNum;
   // Truncate Sh to the 32 lower bits
   return (unsigned)Sh == 0;
 }
 // Return true if this is the master thread.
-__device__ static bool IsMasterThread(bool isSPMDExecutionMode) {
-  return !isSPMDExecutionMode && getMasterThreadId() == getThreadId();
+INLINE static bool IsMasterThread(bool isSPMDExecutionMode) {
+  return !isSPMDExecutionMode && GetMasterThreadID() == GetThreadIdInBlock();
 }
 
 /// Return the provided size aligned to the size of a pointer.
-__device__ static size_t AlignVal(size_t Val) {
+INLINE static size_t AlignVal(size_t Val) {
   const size_t Align = (size_t)sizeof(void *);
   if (Val & (Align - 1)) {
     Val += Align;
@@ -128,7 +112,7 @@
           (unsigned long long)SharingDefaultDataSize);
 
   unsigned WID = getWarpId();
-  unsigned CurActiveThreads = getActiveThreadsMask();
+  unsigned CurActiveThreads = __ACTIVEMASK();
 
   __kmpc_data_sharing_slot *&SlotP = DataSharingState.SlotPtr[WID];
   void *&StackP = DataSharingState.StackPtr[WID];
@@ -268,7 +252,7 @@
     return;
   }
 
-  int32_t CurActive = getActiveThreadsMask();
+  int32_t CurActive = __ACTIVEMASK();
 
   // Only the warp master can restore the stack and frame information, and only
   // if there are no other threads left behind in this environment (i.e. the
@@ -341,7 +325,7 @@
 // Runtime functions for trunk data sharing scheme.
 ////////////////////////////////////////////////////////////////////////////////
 
-INLINE void data_sharing_init_stack_common() {
+INLINE static void data_sharing_init_stack_common() {
   ASSERT0(LT_FUSSY, isRuntimeInitialized(), "Runtime must be initialized.");
   omptarget_nvptx_TeamDescr *teamDescr =
       &omptarget_nvptx_threadPrivateContext->TeamContext();
@@ -380,11 +364,11 @@
   __threadfence_block();
 }
 
-INLINE void* data_sharing_push_stack_common(size_t PushSize) {
+INLINE static void* data_sharing_push_stack_common(size_t PushSize) {
   ASSERT0(LT_FUSSY, isRuntimeInitialized(), "Expected initialized runtime.");
 
   // Only warp active master threads manage the stack.
-  bool IsWarpMaster = (getThreadId() % WARPSIZE) == 0;
+  bool IsWarpMaster = (GetThreadIdInBlock() % WARPSIZE) == 0;
 
   // Add worst-case padding to DataSize so that future stack allocations are
   // correctly aligned.
@@ -394,7 +378,7 @@
   // Frame pointer must be visible to all workers in the same warp.
   const unsigned WID = getWarpId();
   void *FrameP = 0;
-  const int32_t CurActive = getActiveThreadsMask();
+  int32_t CurActive = __ACTIVEMASK();
 
   if (IsWarpMaster) {
     // SlotP will point to either the shared memory slot or an existing
@@ -454,8 +438,8 @@
   return FrameP;
 }
 
-EXTERN void* __kmpc_data_sharing_coalesced_push_stack(size_t DataSize,
-    int16_t UseSharedMemory) {
+EXTERN void *__kmpc_data_sharing_coalesced_push_stack(size_t DataSize,
+                                                      int16_t UseSharedMemory) {
   return data_sharing_push_stack_common(DataSize);
 }
 
@@ -466,8 +450,8 @@
 // By default the globalized variables are stored in global memory. If the
 // UseSharedMemory is set to true, the runtime will attempt to use shared memory
 // as long as the size requested fits the pre-allocated size.
-EXTERN void* __kmpc_data_sharing_push_stack(size_t DataSize,
-    int16_t UseSharedMemory) {
+EXTERN void *__kmpc_data_sharing_push_stack(size_t DataSize,
+                                            int16_t UseSharedMemory) {
   // Compute the total memory footprint of the requested data.
   // The master thread requires a stack only for itself. A worker
   // thread (which at this point is a warp master) will require
@@ -495,7 +479,7 @@
 
   __threadfence_block();
 
-  if (getThreadId() % WARPSIZE == 0) {
+  if (GetThreadIdInBlock() % WARPSIZE == 0) {
     unsigned WID = getWarpId();
 
     // Current slot
@@ -572,7 +556,7 @@
     __SYNCTHREADS();
     return;
   }
-  ASSERT0(LT_FUSSY, GetThreadIdInBlock() == getMasterThreadId(),
+  ASSERT0(LT_FUSSY, GetThreadIdInBlock() == GetMasterThreadID(),
          "Must be called only in the target master thread.");
   *frame = omptarget_nvptx_simpleMemoryManager.Acquire(buf, size);
   __threadfence();
@@ -591,7 +575,7 @@
     return;
   }
   __threadfence();
-  ASSERT0(LT_FUSSY, GetThreadIdInBlock() == getMasterThreadId(),
+  ASSERT0(LT_FUSSY, GetThreadIdInBlock() == GetMasterThreadID(),
          "Must be called only in the target master thread.");
  omptarget_nvptx_simpleMemoryManager.Release();
 }
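Note on the IsWarpMasterActiveThread() rewrite above: it keeps the original bit trick. The active-lane mask is shifted left by WARPSIZE minus the caller's lane id, so only the bits of lanes numbered below the caller survive in the low 32 bits; if those are all zero, no lower-numbered lane is active and the caller is the warp's first active thread. A minimal standalone sketch of the same idea (illustrative only, not part of the patch; assumes CUDA 9+ for __activemask(), and the reported lane set can vary under independent thread scheduling):

#include <cstdio>

__device__ static bool isFirstActiveLane() {
  unsigned lane = threadIdx.x % 32;
  // Bit n of the mask is set iff lane n of this warp is active.
  unsigned long long mask = __activemask();
  // Only bits of lanes < lane remain in the low 32 bits after the shift.
  unsigned long long sh = mask << (32 - lane);
  return (unsigned)sh == 0;
}

__global__ void demo() {
  if (threadIdx.x % 2 == 0) // force divergence: even lanes only
    if (isFirstActiveLane())
      printf("warp %d: first active lane is %d\n", (int)(threadIdx.x / 32),
             (int)(threadIdx.x % 32));
}

int main() {
  demo<<<1, 64>>>();
  cudaDeviceSynchronize();
  return 0;
}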
Index: openmp/trunk/libomptarget/deviceRTLs/nvptx/src/debug.h
===================================================================
--- openmp/trunk/libomptarget/deviceRTLs/nvptx/src/debug.h
+++ openmp/trunk/libomptarget/deviceRTLs/nvptx/src/debug.h
@@ -130,7 +130,7 @@
 #include "option.h"
 
 template <typename... Arguments>
-static NOINLINE void log(const char *fmt, Arguments... parameters) {
+NOINLINE static void log(const char *fmt, Arguments... parameters) {
   printf(fmt, (int)blockIdx.x, (int)threadIdx.x, (int)(threadIdx.x / WARPSIZE),
          (int)(threadIdx.x & 0x1F), parameters...);
 }
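In this RTL, NOINLINE (like INLINE) carries the __device__ qualifier, so only the specifier order changes here. Note that log() always injects four ints (block id, thread id, warp id, lane id) ahead of the caller's arguments, so every format string must begin with four matching conversions. A hypothetical, self-contained call site (the macro definition approximates the RTL's; names and format string are illustrative):

#include <cstdio>
#define WARPSIZE 32
#define NOINLINE __noinline__ __device__ // approximates the RTL macro

template <typename... Arguments>
NOINLINE static void log(const char *fmt, Arguments... parameters) {
  printf(fmt, (int)blockIdx.x, (int)threadIdx.x, (int)(threadIdx.x / WARPSIZE),
         (int)(threadIdx.x & 0x1F), parameters...);
}

__global__ void demo(int iter) {
  // The first four %d are consumed by the values log() injects.
  log("<b %d, t %d, w %d, l %d> starting iteration %d\n", iter);
}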
Index: openmp/trunk/libomptarget/deviceRTLs/nvptx/src/loop.cu
===================================================================
--- openmp/trunk/libomptarget/deviceRTLs/nvptx/src/loop.cu
+++ openmp/trunk/libomptarget/deviceRTLs/nvptx/src/loop.cu
@@ -414,12 +414,8 @@
     return FINISHED;
   }
 
-  // On Pascal, with inlining of the runtime into the user application,
-  // this code deadlocks. This is probably because different threads
-  // in a warp cannot make independent progress.
-  NOINLINE static int dispatch_next(kmp_Ident *loc, int32_t gtid,
-                                    int32_t *plast, T *plower, T *pupper,
-                                    ST *pstride) {
+  INLINE static int dispatch_next(kmp_Ident *loc, int32_t gtid, int32_t *plast,
+                                  T *plower, T *pupper, ST *pstride) {
     ASSERT0(LT_FUSSY, checkRuntimeInitialized(loc),
             "Expected non-SPMD mode + initialized runtime.");
     // ID of a thread in its own warp
Index: openmp/trunk/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.h
===================================================================
--- openmp/trunk/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.h
+++ openmp/trunk/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.h
@@ -53,13 +53,11 @@
 #define __SHFL_SYNC(mask, var, srcLane) __shfl_sync((mask), (var), (srcLane))
 #define __SHFL_DOWN_SYNC(mask, var, delta, width)                             \
   __shfl_down_sync((mask), (var), (delta), (width))
-#define __BALLOT_SYNC(mask, predicate) __ballot_sync((mask), (predicate))
 #define __ACTIVEMASK() __activemask()
 #else
 #define __SHFL_SYNC(mask, var, srcLane) __shfl((var), (srcLane))
 #define __SHFL_DOWN_SYNC(mask, var, delta, width)                             \
   __shfl_down((var), (delta), (width))
-#define __BALLOT_SYNC(mask, predicate) __ballot((predicate))
 #define __ACTIVEMASK() __ballot(1)
 #endif
 
@@ -93,7 +91,7 @@
     }
   }
   // Called by all threads.
-  INLINE void **GetArgs() { return args; };
+  INLINE void **GetArgs() const { return args; };
 private:
   // buffer of pre-allocated arguments.
   void *buffer[MAX_SHARED_ARGS];
@@ -104,7 +102,8 @@
   uint32_t nArgs;
 };
 
-extern __device__ __shared__ omptarget_nvptx_SharedArgs omptarget_nvptx_globalArgs;
+extern __device__ __shared__ omptarget_nvptx_SharedArgs
+    omptarget_nvptx_globalArgs;
 
 // Data sharing related quantities, need to match what is used in the compiler.
 enum DATA_SHARING_SIZES {
@@ -155,23 +154,23 @@
 class omptarget_nvptx_TaskDescr {
 public:
   // methods for flags
-  INLINE omp_sched_t GetRuntimeSched();
+  INLINE omp_sched_t GetRuntimeSched() const;
   INLINE void SetRuntimeSched(omp_sched_t sched);
-  INLINE int InParallelRegion() { return items.flags & TaskDescr_InPar; }
-  INLINE int InL2OrHigherParallelRegion() {
+  INLINE int InParallelRegion() const { return items.flags & TaskDescr_InPar; }
+  INLINE int InL2OrHigherParallelRegion() const {
    return items.flags & TaskDescr_InParL2P;
   }
-  INLINE int IsParallelConstruct() {
+  INLINE int IsParallelConstruct() const {
    return items.flags & TaskDescr_IsParConstr;
   }
-  INLINE int IsTaskConstruct() { return !IsParallelConstruct(); }
+  INLINE int IsTaskConstruct() const { return !IsParallelConstruct(); }
   // methods for other fields
   INLINE uint16_t &NThreads() { return items.nthreads; }
   INLINE uint16_t &ThreadLimit() { return items.threadlimit; }
   INLINE uint16_t &ThreadId() { return items.threadId; }
   INLINE uint16_t &ThreadsInTeam() { return items.threadsInTeam; }
   INLINE uint64_t &RuntimeChunkSize() { return items.runtimeChunkSize; }
-  INLINE omptarget_nvptx_TaskDescr *GetPrevTaskDescr() { return prev; }
+  INLINE omptarget_nvptx_TaskDescr *GetPrevTaskDescr() const { return prev; }
   INLINE void SetPrevTaskDescr(omptarget_nvptx_TaskDescr *taskDescr) {
     prev = taskDescr;
   }
@@ -326,7 +325,7 @@
                                       omptarget_nvptx_TaskDescr *taskICV) {
     topTaskDescr[tid] = taskICV;
   }
-  INLINE omptarget_nvptx_TaskDescr *GetTopLevelTaskDescr(int tid);
+  INLINE omptarget_nvptx_TaskDescr *GetTopLevelTaskDescr(int tid) const;
   // parallel
   INLINE uint16_t &NumThreadsForNextParallel(int tid) {
     return nextRegion.tnum[tid];
@@ -381,7 +380,7 @@
     volatile unsigned keys[OMP_STATE_COUNT];
   } MemData[MAX_SM];
 
-  INLINE uint32_t hash(unsigned key) const {
+  INLINE static uint32_t hash(unsigned key) {
     return key & (OMP_STATE_COUNT - 1);
   }
 
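With every former __BALLOT_SYNC(0xFFFFFFFF, true) caller migrated to __ACTIVEMASK(), the ballot shim can be dropped on both sides of the version guard. The two remaining definitions are equivalent for this use: on CUDA 9+ __activemask() reports the currently active lanes directly, while on older toolkits __ballot(1) has every executing lane contribute its bit. A condensed sketch of the shim and a typical consumer (the guard condition is spelled as an assumption; the header's actual condition may differ):

#include <cuda.h>

#if CUDA_VERSION >= 9000
#define __ACTIVEMASK() __activemask()
#else
#define __ACTIVEMASK() __ballot(1)
#endif

// Count the active lanes of the calling warp, e.g. to size a partial-warp
// reduction such as gpu_irregular_warp_reduce() in reduction.cu below.
__device__ static unsigned countActiveLanes() {
  return __popc(__ACTIVEMASK());
}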
Index: openmp/trunk/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.cu
===================================================================
--- openmp/trunk/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.cu
+++ openmp/trunk/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.cu
@@ -29,7 +29,7 @@
 // init entry points
 ////////////////////////////////////////////////////////////////////////////////
 
-INLINE unsigned smid() {
+INLINE static unsigned smid() {
   unsigned id;
   asm("mov.u32 %0, %%smid;" : "=r"(id));
   return id;
Index: openmp/trunk/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptxi.h
===================================================================
--- openmp/trunk/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptxi.h
+++ openmp/trunk/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptxi.h
@@ -16,7 +16,7 @@
 // Task Descriptor
 ////////////////////////////////////////////////////////////////////////////////
 
-INLINE omp_sched_t omptarget_nvptx_TaskDescr::GetRuntimeSched() {
+INLINE omp_sched_t omptarget_nvptx_TaskDescr::GetRuntimeSched() const {
   // sched starts from 1..4; encode it as 0..3; so add 1 here
   uint8_t rc = (items.flags & TaskDescr_SchedMask) + 1;
   return (omp_sched_t)rc;
@@ -155,7 +155,7 @@
 ////////////////////////////////////////////////////////////////////////////////
 
 INLINE omptarget_nvptx_TaskDescr *
-omptarget_nvptx_ThreadPrivateContext::GetTopLevelTaskDescr(int tid) {
+omptarget_nvptx_ThreadPrivateContext::GetTopLevelTaskDescr(int tid) const {
   ASSERT0(
       LT_FUSSY, tid < MAX_THREADS_PER_TEAM,
       "Getting top level, tid is larger than allocated data structure size");
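The const additions in omptarget-nvptxi.h are behavior-neutral; GetRuntimeSched() still decodes the schedule from the low flag bits. A worked round trip of that encoding as a standalone sketch (TaskDescr_SchedMask is assumed here to select the low two bits, and the omp_sched_t values are the standard 1..4):

typedef enum {
  omp_sched_static = 1,
  omp_sched_dynamic = 2,
  omp_sched_guided = 3,
  omp_sched_auto = 4
} omp_sched_t;

static const unsigned char TaskDescr_SchedMask = 0x3; // assumed: low two bits

// Stored as 0..3 in the flags, reported as 1..4, hence the "+ 1".
static omp_sched_t decodeSched(unsigned char flags) {
  return (omp_sched_t)((flags & TaskDescr_SchedMask) + 1);
}
static unsigned char encodeSched(unsigned char flags, omp_sched_t s) {
  return (flags & ~TaskDescr_SchedMask) | (unsigned char)(s - 1);
}
// e.g. decodeSched(encodeSched(0, omp_sched_guided)) == omp_sched_guided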
Index: openmp/trunk/libomptarget/deviceRTLs/nvptx/src/parallel.cu
===================================================================
--- openmp/trunk/libomptarget/deviceRTLs/nvptx/src/parallel.cu
+++ openmp/trunk/libomptarget/deviceRTLs/nvptx/src/parallel.cu
@@ -193,7 +193,7 @@
 // support for parallel that goes parallel (1 static level only)
 ////////////////////////////////////////////////////////////////////////////////
 
-static INLINE uint16_t determineNumberOfThreads(uint16_t NumThreadsClause,
+INLINE static uint16_t determineNumberOfThreads(uint16_t NumThreadsClause,
                                                 uint16_t NThreadsICV,
                                                 uint16_t ThreadLimit) {
   uint16_t ThreadsRequested = NThreadsICV;
@@ -236,7 +236,7 @@
   // This routine is only called by the team master. The team master is
   // the first thread of the last warp. It always has the logical thread
   // id of 0 (since it is a shadow for the first worker thread).
-  int threadId = 0;
+  const int threadId = 0;
   omptarget_nvptx_TaskDescr *currTaskDescr =
       omptarget_nvptx_threadPrivateContext->GetTopLevelTaskDescr(threadId);
   ASSERT0(LT_FUSSY, currTaskDescr, "expected a top task descr");
Index: openmp/trunk/libomptarget/deviceRTLs/nvptx/src/reduction.cu
===================================================================
--- openmp/trunk/libomptarget/deviceRTLs/nvptx/src/reduction.cu
+++ openmp/trunk/libomptarget/deviceRTLs/nvptx/src/reduction.cu
@@ -86,7 +86,7 @@
   return val;
 }
 
-static INLINE void gpu_regular_warp_reduce(void *reduce_data,
+INLINE static void gpu_regular_warp_reduce(void *reduce_data,
                                            kmp_ShuffleReductFctPtr shflFct) {
   for (uint32_t mask = WARPSIZE / 2; mask > 0; mask /= 2) {
     shflFct(reduce_data, /*LaneId - not used= */ 0,
@@ -94,7 +94,7 @@
   }
 }
 
-static INLINE void gpu_irregular_warp_reduce(void *reduce_data,
+INLINE static void gpu_irregular_warp_reduce(void *reduce_data,
                                              kmp_ShuffleReductFctPtr shflFct,
                                              uint32_t size, uint32_t tid) {
   uint32_t curr_size;
@@ -108,18 +108,18 @@
   }
 }
 
-static INLINE uint32_t
+INLINE static uint32_t
 gpu_irregular_simd_reduce(void *reduce_data, kmp_ShuffleReductFctPtr shflFct) {
   uint32_t lanemask_lt;
   uint32_t lanemask_gt;
   uint32_t size, remote_id, physical_lane_id;
   physical_lane_id = GetThreadIdInBlock() % WARPSIZE;
   asm("mov.u32 %0, %%lanemask_lt;" : "=r"(lanemask_lt));
-  uint32_t Liveness = __BALLOT_SYNC(0xFFFFFFFF, true);
+  uint32_t Liveness = __ACTIVEMASK();
   uint32_t logical_lane_id = __popc(Liveness & lanemask_lt) * 2;
   asm("mov.u32 %0, %%lanemask_gt;" : "=r"(lanemask_gt));
   do {
-    Liveness = __BALLOT_SYNC(0xFFFFFFFF, true);
+    Liveness = __ACTIVEMASK();
     remote_id = __ffs(Liveness & lanemask_gt);
     size = __popc(Liveness);
     logical_lane_id /= 2;
@@ -134,7 +134,7 @@
                                    size_t reduce_size, void *reduce_data,
                                    kmp_ShuffleReductFctPtr shflFct,
                                    kmp_InterWarpCopyFctPtr cpyFct) {
-  uint32_t Liveness = __BALLOT_SYNC(0xFFFFFFFF, true);
+  uint32_t Liveness = __ACTIVEMASK();
   if (Liveness == 0xffffffff) {
     gpu_regular_warp_reduce(reduce_data, shflFct);
     return GetThreadIdInBlock() % WARPSIZE ==
@@ -146,12 +146,10 @@
 }
 
 INLINE
-int32_t nvptx_parallel_reduce_nowait(int32_t global_tid, int32_t num_vars,
-                                     size_t reduce_size, void *reduce_data,
-                                     kmp_ShuffleReductFctPtr shflFct,
-                                     kmp_InterWarpCopyFctPtr cpyFct,
-                                     bool isSPMDExecutionMode,
-                                     bool isRuntimeUninitialized) {
+static int32_t nvptx_parallel_reduce_nowait(
+    int32_t global_tid, int32_t num_vars, size_t reduce_size, void *reduce_data,
+    kmp_ShuffleReductFctPtr shflFct, kmp_InterWarpCopyFctPtr cpyFct,
+    bool isSPMDExecutionMode, bool isRuntimeUninitialized) {
   uint32_t BlockThreadId = GetLogicalThreadIdInBlock(isSPMDExecutionMode);
   uint32_t NumThreads = GetNumberOfOmpThreads(
       BlockThreadId, isSPMDExecutionMode, isRuntimeUninitialized);
@@ -195,12 +193,10 @@
     if (WarpId == 0)
       gpu_irregular_warp_reduce(reduce_data, shflFct, WarpsNeeded,
                                 BlockThreadId);
-
-    return BlockThreadId == 0;
   }
   return BlockThreadId == 0;
 #else
-  uint32_t Liveness = __BALLOT_SYNC(0xFFFFFFFF, true);
+  uint32_t Liveness = __ACTIVEMASK();
   if (Liveness == 0xffffffff) // Full warp
     gpu_regular_warp_reduce(reduce_data, shflFct);
   else if (!(Liveness & (Liveness + 1))) // Partial warp but contiguous lanes
@@ -278,7 +274,7 @@
 }
 
 INLINE
-int32_t nvptx_teams_reduce_nowait(
+static int32_t nvptx_teams_reduce_nowait(
     int32_t global_tid, int32_t num_vars, size_t reduce_size, void *reduce_data,
     kmp_ShuffleReductFctPtr shflFct, kmp_InterWarpCopyFctPtr cpyFct,
     kmp_CopyToScratchpadFctPtr scratchFct, kmp_LoadReduceFctPtr ldFct,
@@ -378,7 +374,7 @@
       ldFct(reduce_data, scratchpad, i, NumTeams, /*Load and reduce*/ 1);
 
     // Reduce across warps to the warp master.
-    uint32_t Liveness = __BALLOT_SYNC(0xFFFFFFFF, true);
+    uint32_t Liveness = __ACTIVEMASK();
     if (Liveness == 0xffffffff) // Full warp
       gpu_regular_warp_reduce(reduce_data, shflFct);
     else // Partial warp but contiguous lanes
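For reference, gpu_regular_warp_reduce() above drives the classic shuffle-down butterfly through the compiler-generated shflFct callback. Specialized to a plain int sum, the same pattern looks as follows (a sketch under the full-warp assumption of the Liveness == 0xffffffff path, using CUDA 9+ intrinsics; the RTL itself never reduces a value directly):

__device__ static int warpSumFull(int val) {
  // Halve the stride each step: 16, 8, 4, 2, 1.
  for (unsigned delta = 32 / 2; delta > 0; delta /= 2)
    val += __shfl_down_sync(0xffffffff, val, delta);
  return val; // lane 0 ends up holding the warp-wide sum
}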
Index: openmp/trunk/libomptarget/deviceRTLs/nvptx/src/state-queue.h
===================================================================
--- openmp/trunk/libomptarget/deviceRTLs/nvptx/src/state-queue.h
+++ openmp/trunk/libomptarget/deviceRTLs/nvptx/src/state-queue.h
@@ -35,14 +35,14 @@
   static const uint32_t MAX_ID = (1u << 31) / SIZE / 2;
   INLINE uint32_t ENQUEUE_TICKET();
   INLINE uint32_t DEQUEUE_TICKET();
-  INLINE uint32_t ID(uint32_t ticket);
+  INLINE static uint32_t ID(uint32_t ticket);
   INLINE bool IsServing(uint32_t slot, uint32_t id);
   INLINE void PushElement(uint32_t slot, ElementType *element);
   INLINE ElementType *PopElement(uint32_t slot);
   INLINE void DoneServing(uint32_t slot, uint32_t id);
 
 public:
-  INLINE omptarget_nvptx_Queue(){};
+  INLINE omptarget_nvptx_Queue() {}
   INLINE void Enqueue(ElementType *element);
   INLINE ElementType *Dequeue();
 };
Index: openmp/trunk/libomptarget/deviceRTLs/nvptx/src/state-queuei.h
===================================================================
--- openmp/trunk/libomptarget/deviceRTLs/nvptx/src/state-queuei.h
+++ openmp/trunk/libomptarget/deviceRTLs/nvptx/src/state-queuei.h
@@ -30,7 +30,8 @@
 }
 
 template <typename ElementType, uint32_t SIZE>
-INLINE uint32_t omptarget_nvptx_Queue<ElementType, SIZE>::ID(uint32_t ticket) {
+INLINE uint32_t
+omptarget_nvptx_Queue<ElementType, SIZE>::ID(uint32_t ticket) {
   return (ticket / SIZE) * 2;
 }
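ID() above is a pure function of its ticket argument, which is what makes the static qualifier safe. As a worked example of the mapping, take SIZE = 64 (the capacity is a template parameter, so this value is illustrative only):

//   ticket   0..63   ->  ID 0
//   ticket  64..127  ->  ID 2
//   ticket 128..191  ->  ID 4
// i.e. the id advances by 2 for each full pass over the queue's SIZE slots,
// and MAX_ID = (1u << 31) / SIZE / 2 = 16777216 for SIZE = 64 keeps the
// corresponding ticket range (MAX_ID * SIZE * 2 = 2^31) within a 32-bit
// counter before the id wraps.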