Index: libomptarget/deviceRTLs/nvptx/src/data_sharing.cu =================================================================== --- libomptarget/deviceRTLs/nvptx/src/data_sharing.cu +++ libomptarget/deviceRTLs/nvptx/src/data_sharing.cu @@ -43,8 +43,8 @@ return (unsigned)Sh == 0; } // Return true if this is the master thread. -__device__ static bool IsMasterThread() { - return !isSPMDMode() && getMasterThreadId() == getThreadId(); +__device__ static bool IsMasterThread(bool isSPMDExecutionMode) { + return !isSPMDExecutionMode && getMasterThreadId() == getThreadId(); } /// Return the provided size aligned to the size of a pointer. @@ -88,7 +88,8 @@ omptarget_nvptx_TeamDescr *teamDescr = &omptarget_nvptx_threadPrivateContext->TeamContext(); - __kmpc_data_sharing_slot *RootS = teamDescr->RootS(WID, IsMasterThread()); + __kmpc_data_sharing_slot *RootS = + teamDescr->RootS(WID, IsMasterThread(isSPMDMode())); DataSharingState.SlotPtr[WID] = RootS; DataSharingState.StackPtr[WID] = (void *)&RootS->Data[0]; @@ -253,8 +254,9 @@ // The master thread cleans the saved slot, because this is an environment // only for the master. - __kmpc_data_sharing_slot *S = - IsMasterThread() ? *SavedSharedSlot : DataSharingState.SlotPtr[WID]; + __kmpc_data_sharing_slot *S = IsMasterThread(isSPMDMode()) + ? *SavedSharedSlot + : DataSharingState.SlotPtr[WID]; if (S->Next) { free(S->Next); @@ -472,8 +474,9 @@ // space for the variables of each thread in the warp, // i.e. one DataSize chunk per warp lane. // TODO: change WARPSIZE to the number of active threads in the warp. - size_t PushSize = (isRuntimeUninitialized() || IsMasterThread()) ? - DataSize : WARPSIZE * DataSize; + size_t PushSize = (isRuntimeUninitialized() || IsMasterThread(isSPMDMode())) + ? DataSize + : WARPSIZE * DataSize; // Compute the start address of the frame of each thread in the warp. uintptr_t FrameStartAddress = @@ -553,14 +556,15 @@ // manage statically allocated global memory. This memory is allocated by the // compiler and used to correctly implement globalization of the variables in // target, teams and distribute regions. -EXTERN void __kmpc_get_team_static_memory(const void *buf, size_t size, +EXTERN void __kmpc_get_team_static_memory(int16_t isSPMDExecutionMode, + const void *buf, size_t size, int16_t is_shared, const void **frame) { if (is_shared) { *frame = buf; return; } - if (isSPMDMode()) { + if (isSPMDExecutionMode) { if (GetThreadIdInBlock() == 0) { *frame = omptarget_nvptx_simpleMemoryManager.Acquire(buf, size); } @@ -574,10 +578,11 @@ __threadfence(); } -EXTERN void __kmpc_restore_team_static_memory(int16_t is_shared) { +EXTERN void __kmpc_restore_team_static_memory(int16_t isSPMDExecutionMode, + int16_t is_shared) { if (is_shared) return; - if (isSPMDMode()) { + if (isSPMDExecutionMode) { // FIXME: use __syncthreads instead when the function copy is fixed in LLVM. __SYNCTHREADS(); if (GetThreadIdInBlock() == 0) { Index: libomptarget/deviceRTLs/nvptx/src/interface.h =================================================================== --- libomptarget/deviceRTLs/nvptx/src/interface.h +++ libomptarget/deviceRTLs/nvptx/src/interface.h @@ -395,9 +395,13 @@ // reduction EXTERN void __kmpc_nvptx_end_reduce(int32_t global_tid); EXTERN void __kmpc_nvptx_end_reduce_nowait(int32_t global_tid); -EXTERN int32_t __kmpc_nvptx_parallel_reduce_nowait( +EXTERN __attribute__((deprecated)) int32_t __kmpc_nvptx_parallel_reduce_nowait( int32_t global_tid, int32_t num_vars, size_t reduce_size, void *reduce_data, kmp_ShuffleReductFctPtr shflFct, kmp_InterWarpCopyFctPtr cpyFct); +EXTERN int32_t __kmpc_nvptx_parallel_reduce_nowait_v2( + kmp_Ident *loc, int32_t global_tid, int32_t num_vars, size_t reduce_size, + void *reduce_data, kmp_ShuffleReductFctPtr shflFct, + kmp_InterWarpCopyFctPtr cpyFct); EXTERN int32_t __kmpc_nvptx_parallel_reduce_nowait_simple_spmd( int32_t global_tid, int32_t num_vars, size_t reduce_size, void *reduce_data, kmp_ShuffleReductFctPtr shflFct, kmp_InterWarpCopyFctPtr cpyFct); @@ -550,9 +554,11 @@ // SPMD execution mode interrogation function. EXTERN int8_t __kmpc_is_spmd_exec_mode(); -EXTERN void __kmpc_get_team_static_memory(const void *buf, size_t size, +EXTERN void __kmpc_get_team_static_memory(int16_t isSPMDExecutionMode, + const void *buf, size_t size, int16_t is_shared, const void **res); -EXTERN void __kmpc_restore_team_static_memory(int16_t is_shared); +EXTERN void __kmpc_restore_team_static_memory(int16_t isSPMDExecutionMode, + int16_t is_shared); #endif Index: libomptarget/deviceRTLs/nvptx/src/libcall.cu =================================================================== --- libomptarget/deviceRTLs/nvptx/src/libcall.cu +++ libomptarget/deviceRTLs/nvptx/src/libcall.cu @@ -39,14 +39,17 @@ if (num <= 0) { WARNING0(LW_INPUT, "expected positive num; ignore\n"); } else { - omptarget_nvptx_TaskDescr *currTaskDescr = getMyTopTaskDescriptor(); + omptarget_nvptx_TaskDescr *currTaskDescr = + getMyTopTaskDescriptor(/*isSPMDExecutionMode=*/false); currTaskDescr->NThreads() = num; } } EXTERN int omp_get_num_threads(void) { - int tid = GetLogicalThreadIdInBlock(); - int rc = GetNumberOfOmpThreads(tid, isSPMDMode(), isRuntimeUninitialized()); + bool isSPMDExecutionMode = isSPMDMode(); + int tid = GetLogicalThreadIdInBlock(isSPMDExecutionMode); + int rc = + GetNumberOfOmpThreads(tid, isSPMDExecutionMode, isRuntimeUninitialized()); PRINT(LD_IO, "call omp_get_num_threads() return %d\n", rc); return rc; } @@ -58,7 +61,8 @@ // We're already in parallel region. return 1; // default is 1 thread avail } - omptarget_nvptx_TaskDescr *currTaskDescr = getMyTopTaskDescriptor(); + omptarget_nvptx_TaskDescr *currTaskDescr = + getMyTopTaskDescriptor(isSPMDMode()); int rc = 1; // default is 1 thread avail if (!currTaskDescr->InParallelRegion()) { // Not currently in a parallel region, return what was set. @@ -76,21 +80,23 @@ return 0; // default is 0 } // per contention group.. meaning threads in current team - omptarget_nvptx_TaskDescr *currTaskDescr = getMyTopTaskDescriptor(); + omptarget_nvptx_TaskDescr *currTaskDescr = + getMyTopTaskDescriptor(isSPMDMode()); int rc = currTaskDescr->ThreadLimit(); PRINT(LD_IO, "call omp_get_thread_limit() return %d\n", rc); return rc; } EXTERN int omp_get_thread_num() { - int tid = GetLogicalThreadIdInBlock(); - int rc = GetOmpThreadId(tid, isSPMDMode(), isRuntimeUninitialized()); + bool isSPMDExecutionMode = isSPMDMode(); + int tid = GetLogicalThreadIdInBlock(isSPMDExecutionMode); + int rc = GetOmpThreadId(tid, isSPMDExecutionMode, isRuntimeUninitialized()); PRINT(LD_IO, "call omp_get_thread_num() returns %d\n", rc); return rc; } EXTERN int omp_get_num_procs(void) { - int rc = GetNumberOfProcsInDevice(); + int rc = GetNumberOfProcsInDevice(isSPMDMode()); PRINT(LD_IO, "call omp_get_num_procs() returns %d\n", rc); return rc; } @@ -102,7 +108,8 @@ "Expected SPMD mode only with uninitialized runtime."); rc = 1; // SPMD mode is always in parallel. } else { - omptarget_nvptx_TaskDescr *currTaskDescr = getMyTopTaskDescriptor(); + omptarget_nvptx_TaskDescr *currTaskDescr = + getMyTopTaskDescriptor(isSPMDMode()); if (currTaskDescr->InParallelRegion()) { rc = 1; } @@ -161,7 +168,8 @@ return omptarget_nvptx_simpleThreadPrivateContext->GetParallelLevel(); } int level = 0; - omptarget_nvptx_TaskDescr *currTaskDescr = getMyTopTaskDescriptor(); + omptarget_nvptx_TaskDescr *currTaskDescr = + getMyTopTaskDescriptor(isSPMDMode()); ASSERT0(LT_FUSSY, currTaskDescr, "do not expect fct to be called in a non-active thread"); do { @@ -181,7 +189,8 @@ return 1; } int level = 0; // no active level parallelism - omptarget_nvptx_TaskDescr *currTaskDescr = getMyTopTaskDescriptor(); + omptarget_nvptx_TaskDescr *currTaskDescr = + getMyTopTaskDescriptor(isSPMDMode()); ASSERT0(LT_FUSSY, currTaskDescr, "do not expect fct to be called in a non-active thread"); do { @@ -208,7 +217,8 @@ } else if (level > 0) { int totLevel = omp_get_level(); if (level <= totLevel) { - omptarget_nvptx_TaskDescr *currTaskDescr = getMyTopTaskDescriptor(); + omptarget_nvptx_TaskDescr *currTaskDescr = + getMyTopTaskDescriptor(isSPMDMode()); int steps = totLevel - level; PRINT(LD_IO, "backtrack %d steps\n", steps); ASSERT0(LT_FUSSY, currTaskDescr, @@ -259,7 +269,8 @@ } else if (level > 0) { int totLevel = omp_get_level(); if (level <= totLevel) { - omptarget_nvptx_TaskDescr *currTaskDescr = getMyTopTaskDescriptor(); + omptarget_nvptx_TaskDescr *currTaskDescr = + getMyTopTaskDescriptor(isSPMDMode()); int steps = totLevel - level; ASSERT0(LT_FUSSY, currTaskDescr, "do not expect fct to be called in a non-active thread"); @@ -288,7 +299,8 @@ *kind = omp_sched_static; *modifier = 1; } else { - omptarget_nvptx_TaskDescr *currTaskDescr = getMyTopTaskDescriptor(); + omptarget_nvptx_TaskDescr *currTaskDescr = + getMyTopTaskDescriptor(isSPMDMode()); *kind = currTaskDescr->GetRuntimeSched(); *modifier = currTaskDescr->RuntimeChunkSize(); } @@ -305,7 +317,8 @@ return; } if (kind >= omp_sched_static && kind < omp_sched_auto) { - omptarget_nvptx_TaskDescr *currTaskDescr = getMyTopTaskDescriptor(); + omptarget_nvptx_TaskDescr *currTaskDescr = + getMyTopTaskDescriptor(isSPMDMode()); currTaskDescr->SetRuntimeSched(kind); currTaskDescr->RuntimeChunkSize() = modifier; PRINT(LD_IOD, "omp_set_schedule did set sched %d & modif %" PRIu64 "\n", Index: libomptarget/deviceRTLs/nvptx/src/loop.cu =================================================================== --- libomptarget/deviceRTLs/nvptx/src/loop.cu +++ libomptarget/deviceRTLs/nvptx/src/loop.cu @@ -101,7 +101,7 @@ // When IsRuntimeUninitialized is true, we assume that the caller is // in an L0 parallel region and that all worker threads participate. - int tid = GetLogicalThreadIdInBlock(); + int tid = GetLogicalThreadIdInBlock(IsSPMDExecutionMode); // Assume we are in teams region or that we use a single block // per target region @@ -208,7 +208,7 @@ ST chunk) { ASSERT0(LT_FUSSY, checkRuntimeInitialized(loc), "Expected non-SPMD mode + initialized runtime."); - int tid = GetLogicalThreadIdInBlock(); + int tid = GetLogicalThreadIdInBlock(checkSPMDMode(loc)); omptarget_nvptx_TaskDescr *currTaskDescr = getMyTopTaskDescriptor(tid); T tnum = currTaskDescr->ThreadsInTeam(); T tripCount = ub - lb + 1; // +1 because ub is inclusive @@ -417,17 +417,18 @@ // On Pascal, with inlining of the runtime into the user application, // this code deadlocks. This is probably because different threads // in a warp cannot make independent progress. - NOINLINE static int dispatch_next(int32_t gtid, int32_t *plast, T *plower, - T *pupper, ST *pstride) { - ASSERT0(LT_FUSSY, isRuntimeInitialized(), + NOINLINE static int dispatch_next(kmp_Ident *loc, int32_t gtid, + int32_t *plast, T *plower, T *pupper, + ST *pstride) { + ASSERT0(LT_FUSSY, checkRuntimeInitialized(loc), "Expected non-SPMD mode + initialized runtime."); // ID of a thread in its own warp // automatically selects thread or warp ID based on selected implementation - int tid = GetLogicalThreadIdInBlock(); + int tid = GetLogicalThreadIdInBlock(checkSPMDMode(loc)); ASSERT0(LT_FUSSY, - gtid < GetNumberOfOmpThreads(tid, isSPMDMode(), - isRuntimeUninitialized()), + gtid < GetNumberOfOmpThreads(tid, checkSPMDMode(loc), + checkRuntimeUninitialized(loc)), "current thread is not needed here; error"); // retrieve schedule kmp_sched_t schedule = @@ -540,7 +541,7 @@ int32_t *p_lb, int32_t *p_ub, int32_t *p_st) { PRINT0(LD_IO, "call kmpc_dispatch_next_4\n"); return omptarget_nvptx_LoopSupport::dispatch_next( - tid, p_last, p_lb, p_ub, p_st); + loc, tid, p_last, p_lb, p_ub, p_st); } EXTERN int __kmpc_dispatch_next_4u(kmp_Ident *loc, int32_t tid, @@ -548,14 +549,14 @@ uint32_t *p_ub, int32_t *p_st) { PRINT0(LD_IO, "call kmpc_dispatch_next_4u\n"); return omptarget_nvptx_LoopSupport::dispatch_next( - tid, p_last, p_lb, p_ub, p_st); + loc, tid, p_last, p_lb, p_ub, p_st); } EXTERN int __kmpc_dispatch_next_8(kmp_Ident *loc, int32_t tid, int32_t *p_last, int64_t *p_lb, int64_t *p_ub, int64_t *p_st) { PRINT0(LD_IO, "call kmpc_dispatch_next_8\n"); return omptarget_nvptx_LoopSupport::dispatch_next( - tid, p_last, p_lb, p_ub, p_st); + loc, tid, p_last, p_lb, p_ub, p_st); } EXTERN int __kmpc_dispatch_next_8u(kmp_Ident *loc, int32_t tid, @@ -563,7 +564,7 @@ uint64_t *p_ub, int64_t *p_st) { PRINT0(LD_IO, "call kmpc_dispatch_next_8u\n"); return omptarget_nvptx_LoopSupport::dispatch_next( - tid, p_last, p_lb, p_ub, p_st); + loc, tid, p_last, p_lb, p_ub, p_st); } // fini @@ -756,7 +757,7 @@ "Expected non-SPMD mode + initialized runtime."); omptarget_nvptx_TeamDescr &teamDescr = getMyTeamDescriptor(); - int tid = GetLogicalThreadIdInBlock(); + int tid = GetLogicalThreadIdInBlock(checkSPMDMode(loc)); uint32_t NumThreads = GetNumberOfOmpThreads(tid, checkSPMDMode(loc), checkRuntimeUninitialized(loc)); uint64_t *Buffer = teamDescr.getLastprivateIterBuffer(); Index: libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.h =================================================================== --- libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.h +++ libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.h @@ -176,7 +176,7 @@ prev = taskDescr; } // init & copy - INLINE void InitLevelZeroTaskDescr(); + INLINE void InitLevelZeroTaskDescr(bool isSPMDExecutionMode); INLINE void InitLevelOneTaskDescr(uint16_t tnum, omptarget_nvptx_TaskDescr *parentTaskDescr); INLINE void Copy(omptarget_nvptx_TaskDescr *sourceTaskDescr); @@ -257,7 +257,7 @@ INLINE uint64_t *getLastprivateIterBuffer() { return &lastprivateIterBuffer; } // init - INLINE void InitTeamDescr(); + INLINE void InitTeamDescr(bool isSPMDExecutionMode); INLINE __kmpc_data_sharing_slot *RootS(int wid, bool IsMasterThread) { // If this is invoked by the master thread of the master warp then intialize @@ -462,7 +462,8 @@ INLINE omptarget_nvptx_TeamDescr &getMyTeamDescriptor(); INLINE omptarget_nvptx_WorkDescr &getMyWorkDescriptor(); -INLINE omptarget_nvptx_TaskDescr *getMyTopTaskDescriptor(); +INLINE omptarget_nvptx_TaskDescr * +getMyTopTaskDescriptor(bool isSPMDExecutionMode); INLINE omptarget_nvptx_TaskDescr *getMyTopTaskDescriptor(int globalThreadId); //////////////////////////////////////////////////////////////////////////////// Index: libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.cu =================================================================== --- libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.cu +++ libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.cu @@ -61,12 +61,12 @@ omptarget_nvptx_device_State[slot].Dequeue(); // init thread private - int threadId = GetLogicalThreadIdInBlock(); + int threadId = GetLogicalThreadIdInBlock(/*isSPMDExecutionMode=*/false); omptarget_nvptx_threadPrivateContext->InitThreadPrivateContext(threadId); // init team context omptarget_nvptx_TeamDescr &currTeamDescr = getMyTeamDescriptor(); - currTeamDescr.InitTeamDescr(); + currTeamDescr.InitTeamDescr(/*isSPMDExecutionMode=*/false); // this thread will start execution... has to update its task ICV // to point to the level zero task ICV. That ICV was init in // InitTeamDescr() @@ -128,7 +128,7 @@ omptarget_nvptx_TeamDescr &currTeamDescr = getMyTeamDescriptor(); omptarget_nvptx_WorkDescr &workDescr = getMyWorkDescriptor(); // init team context - currTeamDescr.InitTeamDescr(); + currTeamDescr.InitTeamDescr(/*isSPMDExecutionMode=*/true); } // FIXME: use __syncthreads instead when the function copy is fixed in LLVM. __SYNCTHREADS(); Index: libomptarget/deviceRTLs/nvptx/src/omptarget-nvptxi.h =================================================================== --- libomptarget/deviceRTLs/nvptx/src/omptarget-nvptxi.h +++ libomptarget/deviceRTLs/nvptx/src/omptarget-nvptxi.h @@ -31,7 +31,8 @@ items.flags |= val; } -INLINE void omptarget_nvptx_TaskDescr::InitLevelZeroTaskDescr() { +INLINE void +omptarget_nvptx_TaskDescr::InitLevelZeroTaskDescr(bool isSPMDExecutionMode) { // slow method // flag: // default sched is static, @@ -39,7 +40,7 @@ // not in parallel items.flags = 0; - items.nthreads = GetNumberOfProcsInTeam(); + items.nthreads = GetNumberOfProcsInTeam(isSPMDExecutionMode); ; // threads: whatever was alloc by kernel items.threadId = 0; // is master items.threadsInTeam = 1; // sequential @@ -177,8 +178,8 @@ // Team Descriptor //////////////////////////////////////////////////////////////////////////////// -INLINE void omptarget_nvptx_TeamDescr::InitTeamDescr() { - levelZeroTaskDescr.InitLevelZeroTaskDescr(); +INLINE void omptarget_nvptx_TeamDescr::InitTeamDescr(bool isSPMDExecutionMode) { + levelZeroTaskDescr.InitLevelZeroTaskDescr(isSPMDExecutionMode); } //////////////////////////////////////////////////////////////////////////////// @@ -199,8 +200,9 @@ return omptarget_nvptx_threadPrivateContext->GetTopLevelTaskDescr(threadId); } -INLINE omptarget_nvptx_TaskDescr *getMyTopTaskDescriptor() { - return getMyTopTaskDescriptor(GetLogicalThreadIdInBlock()); +INLINE omptarget_nvptx_TaskDescr * +getMyTopTaskDescriptor(bool isSPMDExecutionMode) { + return getMyTopTaskDescriptor(GetLogicalThreadIdInBlock(isSPMDExecutionMode)); } //////////////////////////////////////////////////////////////////////////////// Index: libomptarget/deviceRTLs/nvptx/src/parallel.cu =================================================================== --- libomptarget/deviceRTLs/nvptx/src/parallel.cu +++ libomptarget/deviceRTLs/nvptx/src/parallel.cu @@ -57,7 +57,7 @@ asm("mov.u32 %0, %%lanemask_lt;" : "=r"(lanemask_lt)); *LaneId = __popc(ConvergentMask & lanemask_lt); - int threadId = GetLogicalThreadIdInBlock(); + int threadId = GetLogicalThreadIdInBlock(isSPMDMode()); int sourceThreadId = (threadId & ~(WARPSIZE - 1)) + *LaneSource; ConvergentSimdJob *job = (ConvergentSimdJob *)buffer; @@ -101,7 +101,7 @@ EXTERN void __kmpc_kernel_end_convergent_simd(void *buffer) { PRINT0(LD_IO | LD_PAR, "call to __kmpc_kernel_end_convergent_parallel\n"); // pop stack - int threadId = GetLogicalThreadIdInBlock(); + int threadId = GetLogicalThreadIdInBlock(isSPMDMode()); ConvergentSimdJob *job = (ConvergentSimdJob *)buffer; omptarget_nvptx_threadPrivateContext->SimdLimitForNextSimd(threadId) = job->slimForNextSimd; @@ -131,7 +131,7 @@ asm("mov.u32 %0, %%lanemask_lt;" : "=r"(lanemask_lt)); uint32_t OmpId = __popc(ConvergentMask & lanemask_lt); - int threadId = GetLogicalThreadIdInBlock(); + int threadId = GetLogicalThreadIdInBlock(isSPMDMode()); int sourceThreadId = (threadId & ~(WARPSIZE - 1)) + *LaneSource; ConvergentParallelJob *job = (ConvergentParallelJob *)buffer; @@ -181,7 +181,7 @@ EXTERN void __kmpc_kernel_end_convergent_parallel(void *buffer) { PRINT0(LD_IO | LD_PAR, "call to __kmpc_kernel_end_convergent_parallel\n"); // pop stack - int threadId = GetLogicalThreadIdInBlock(); + int threadId = GetLogicalThreadIdInBlock(isSPMDMode()); ConvergentParallelJob *job = (ConvergentParallelJob *)buffer; omptarget_nvptx_threadPrivateContext->SetTopLevelTaskDescr( threadId, job->convHeadTaskDescr); @@ -345,7 +345,7 @@ } // assume this is only called for nested parallel - int threadId = GetLogicalThreadIdInBlock(); + int threadId = GetLogicalThreadIdInBlock(checkSPMDMode(loc)); // unlike actual parallel, threads in the same team do not share // the workTaskDescr in this case and num threads is fixed to 1 @@ -384,7 +384,7 @@ } // pop stack - int threadId = GetLogicalThreadIdInBlock(); + int threadId = GetLogicalThreadIdInBlock(checkSPMDMode(loc)); omptarget_nvptx_TaskDescr *currTaskDescr = getMyTopTaskDescriptor(threadId); // set new top omptarget_nvptx_threadPrivateContext->SetTopLevelTaskDescr( @@ -404,7 +404,7 @@ return omptarget_nvptx_simpleThreadPrivateContext->GetParallelLevel(); } - int threadId = GetLogicalThreadIdInBlock(); + int threadId = GetLogicalThreadIdInBlock(checkSPMDMode(loc)); omptarget_nvptx_TaskDescr *currTaskDescr = omptarget_nvptx_threadPrivateContext->GetTopLevelTaskDescr(threadId); if (currTaskDescr->InL2OrHigherParallelRegion()) @@ -420,7 +420,7 @@ // it's cheap to recalculate this value so we never use the result // of this call. EXTERN int32_t __kmpc_global_thread_num(kmp_Ident *loc) { - int tid = GetLogicalThreadIdInBlock(); + int tid = GetLogicalThreadIdInBlock(checkSPMDMode(loc)); return GetOmpThreadId(tid, checkSPMDMode(loc), checkRuntimeUninitialized(loc)); } @@ -433,7 +433,7 @@ int32_t num_threads) { PRINT(LD_IO, "call kmpc_push_num_threads %d\n", num_threads); ASSERT0(LT_FUSSY, checkRuntimeInitialized(loc), "Runtime must be initialized."); - tid = GetLogicalThreadIdInBlock(); + tid = GetLogicalThreadIdInBlock(checkSPMDMode(loc)); omptarget_nvptx_threadPrivateContext->NumThreadsForNextParallel(tid) = num_threads; } @@ -442,7 +442,7 @@ int32_t simd_limit) { PRINT(LD_IO, "call kmpc_push_simd_limit %d\n", (int)simd_limit); ASSERT0(LT_FUSSY, checkRuntimeInitialized(loc), "Runtime must be initialized."); - tid = GetLogicalThreadIdInBlock(); + tid = GetLogicalThreadIdInBlock(checkSPMDMode(loc)); omptarget_nvptx_threadPrivateContext->SimdLimitForNextSimd(tid) = simd_limit; } Index: libomptarget/deviceRTLs/nvptx/src/reduction.cu =================================================================== --- libomptarget/deviceRTLs/nvptx/src/reduction.cu +++ libomptarget/deviceRTLs/nvptx/src/reduction.cu @@ -20,8 +20,10 @@ // may eventually remove this EXTERN int32_t __gpu_block_reduce() { - int tid = GetLogicalThreadIdInBlock(); - int nt = GetNumberOfOmpThreads(tid, isSPMDMode(), isRuntimeUninitialized()); + bool isSPMDExecutionMode = isSPMDMode(); + int tid = GetLogicalThreadIdInBlock(isSPMDExecutionMode); + int nt = + GetNumberOfOmpThreads(tid, isSPMDExecutionMode, isRuntimeUninitialized()); if (nt != blockDim.x) return 0; unsigned tnum = __ACTIVEMASK(); @@ -35,7 +37,7 @@ size_t reduce_size, void *reduce_data, void *reduce_array_size, kmp_ReductFctPtr *reductFct, kmp_CriticalName *lck) { - int threadId = GetLogicalThreadIdInBlock(); + int threadId = GetLogicalThreadIdInBlock(checkSPMDMode(loc)); omptarget_nvptx_TaskDescr *currTaskDescr = getMyTopTaskDescriptor(threadId); int numthread; if (currTaskDescr->IsParallelConstruct()) { @@ -150,7 +152,7 @@ kmp_InterWarpCopyFctPtr cpyFct, bool isSPMDExecutionMode, bool isRuntimeUninitialized) { - uint32_t BlockThreadId = GetLogicalThreadIdInBlock(); + uint32_t BlockThreadId = GetLogicalThreadIdInBlock(isSPMDExecutionMode); uint32_t NumThreads = GetNumberOfOmpThreads( BlockThreadId, isSPMDExecutionMode, isRuntimeUninitialized); if (NumThreads == 1) @@ -236,8 +238,7 @@ #endif // __CUDA_ARCH__ >= 700 } -EXTERN -int32_t __kmpc_nvptx_parallel_reduce_nowait( +EXTERN __attribute__((deprecated)) int32_t __kmpc_nvptx_parallel_reduce_nowait( int32_t global_tid, int32_t num_vars, size_t reduce_size, void *reduce_data, kmp_ShuffleReductFctPtr shflFct, kmp_InterWarpCopyFctPtr cpyFct) { return nvptx_parallel_reduce_nowait( @@ -247,6 +248,16 @@ } EXTERN +int32_t __kmpc_nvptx_parallel_reduce_nowait_v2( + kmp_Ident *loc, int32_t global_tid, int32_t num_vars, size_t reduce_size, + void *reduce_data, kmp_ShuffleReductFctPtr shflFct, + kmp_InterWarpCopyFctPtr cpyFct) { + return nvptx_parallel_reduce_nowait( + global_tid, num_vars, reduce_size, reduce_data, shflFct, cpyFct, + checkSPMDMode(loc), checkRuntimeUninitialized(loc)); +} + +EXTERN int32_t __kmpc_nvptx_parallel_reduce_nowait_simple_spmd( int32_t global_tid, int32_t num_vars, size_t reduce_size, void *reduce_data, kmp_ShuffleReductFctPtr shflFct, kmp_InterWarpCopyFctPtr cpyFct) { @@ -272,7 +283,7 @@ kmp_ShuffleReductFctPtr shflFct, kmp_InterWarpCopyFctPtr cpyFct, kmp_CopyToScratchpadFctPtr scratchFct, kmp_LoadReduceFctPtr ldFct, bool isSPMDExecutionMode, bool isRuntimeUninitialized) { - uint32_t ThreadId = GetLogicalThreadIdInBlock(); + uint32_t ThreadId = GetLogicalThreadIdInBlock(isSPMDExecutionMode); // In non-generic mode all workers participate in the teams reduction. // In generic mode only the team master participates in the teams // reduction because the workers are waiting for parallel work. Index: libomptarget/deviceRTLs/nvptx/src/support.h =================================================================== --- libomptarget/deviceRTLs/nvptx/src/support.h +++ libomptarget/deviceRTLs/nvptx/src/support.h @@ -43,7 +43,7 @@ INLINE int GetNumberOfThreadsInBlock(); // get global ids to locate tread/team info (constant regardless of OMP) -INLINE int GetLogicalThreadIdInBlock(); +INLINE int GetLogicalThreadIdInBlock(bool isSPMDExecutionMode); INLINE int GetMasterThreadID(); INLINE int GetNumberOfWorkersInTeam(); @@ -59,8 +59,8 @@ INLINE int GetNumberOfOmpTeams(); // omp_num_teams // get OpenMP number of procs -INLINE int GetNumberOfProcsInTeam(); -INLINE int GetNumberOfProcsInDevice(); +INLINE int GetNumberOfProcsInTeam(bool isSPMDExecutionMode); +INLINE int GetNumberOfProcsInDevice(bool isSPMDExecutionMode); // masters INLINE int IsTeamMaster(int ompThreadId); Index: libomptarget/deviceRTLs/nvptx/src/supporti.h =================================================================== --- libomptarget/deviceRTLs/nvptx/src/supporti.h +++ libomptarget/deviceRTLs/nvptx/src/supporti.h @@ -130,11 +130,11 @@ // or a serial region by the master. If the master (whose CUDA thread // id is GetMasterThreadID()) calls this routine, we return 0 because // it is a shadow for the first worker. -INLINE int GetLogicalThreadIdInBlock() { +INLINE int GetLogicalThreadIdInBlock(bool isSPMDExecutionMode) { // Implemented using control flow (predication) instead of with a modulo // operation. int tid = GetThreadIdInBlock(); - if (isGenericMode() && tid >= GetMasterThreadID()) + if (!isSPMDExecutionMode && tid >= GetMasterThreadID()) return 0; else return tid; @@ -214,13 +214,15 @@ // get OpenMP number of procs // Get the number of processors in the device. -INLINE int GetNumberOfProcsInDevice() { - if (isGenericMode()) +INLINE int GetNumberOfProcsInDevice(bool isSPMDExecutionMode) { + if (!isSPMDExecutionMode) return GetNumberOfWorkersInTeam(); return GetNumberOfThreadsInBlock(); } -INLINE int GetNumberOfProcsInTeam() { return GetNumberOfProcsInDevice(); } +INLINE int GetNumberOfProcsInTeam(bool isSPMDExecutionMode) { + return GetNumberOfProcsInDevice(isSPMDExecutionMode); +} //////////////////////////////////////////////////////////////////////////////// // Memory Index: libomptarget/deviceRTLs/nvptx/src/sync.cu =================================================================== --- libomptarget/deviceRTLs/nvptx/src/sync.cu +++ libomptarget/deviceRTLs/nvptx/src/sync.cu @@ -46,7 +46,7 @@ "Expected SPMD mode with uninitialized runtime."); __kmpc_barrier_simple_spmd(loc_ref, tid); } else { - tid = GetLogicalThreadIdInBlock(); + tid = GetLogicalThreadIdInBlock(checkSPMDMode(loc_ref)); omptarget_nvptx_TaskDescr *currTaskDescr = omptarget_nvptx_threadPrivateContext->GetTopLevelTaskDescr(tid); int numberOfActiveOMPThreads = GetNumberOfOmpThreads( Index: libomptarget/deviceRTLs/nvptx/src/task.cu =================================================================== --- libomptarget/deviceRTLs/nvptx/src/task.cu +++ libomptarget/deviceRTLs/nvptx/src/task.cu @@ -96,7 +96,7 @@ "bad assumptions"); // 2. push new context: update new task descriptor - int tid = GetLogicalThreadIdInBlock(); + int tid = GetLogicalThreadIdInBlock(checkSPMDMode(loc)); omptarget_nvptx_TaskDescr *parentTaskDescr = getMyTopTaskDescriptor(tid); newTaskDescr->CopyForExplicitTask(parentTaskDescr); // set new task descriptor as top @@ -135,7 +135,7 @@ "bad assumptions"); // 2. push new context: update new task descriptor - int tid = GetLogicalThreadIdInBlock(); + int tid = GetLogicalThreadIdInBlock(checkSPMDMode(loc)); omptarget_nvptx_TaskDescr *parentTaskDescr = getMyTopTaskDescriptor(tid); newTaskDescr->CopyForExplicitTask(parentTaskDescr); // set new task descriptor as top @@ -163,7 +163,7 @@ omptarget_nvptx_TaskDescr *parentTaskDescr = newTaskDescr->GetPrevTaskDescr(); // 3... noting to call... is inline // 4. pop context - int tid = GetLogicalThreadIdInBlock(); + int tid = GetLogicalThreadIdInBlock(checkSPMDMode(loc)); omptarget_nvptx_threadPrivateContext->SetTopLevelTaskDescr(tid, parentTaskDescr); // 5. free