Index: libomptarget/deviceRTLs/nvptx/src/cancel.cu =================================================================== --- libomptarget/deviceRTLs/nvptx/src/cancel.cu +++ libomptarget/deviceRTLs/nvptx/src/cancel.cu @@ -13,14 +13,14 @@ #include "omptarget-nvptx.h" -EXTERN int32_t __kmpc_cancellationpoint(kmp_Indent *loc, int32_t global_tid, +EXTERN int32_t __kmpc_cancellationpoint(kmp_Ident *loc, int32_t global_tid, int32_t cancelVal) { PRINT(LD_IO, "call kmpc_cancellationpoint(cancel val %d)\n", cancelVal); // disabled return FALSE; } -EXTERN int32_t __kmpc_cancel(kmp_Indent *loc, int32_t global_tid, +EXTERN int32_t __kmpc_cancel(kmp_Ident *loc, int32_t global_tid, int32_t cancelVal) { PRINT(LD_IO, "call kmpc_cancel(cancel val %d)\n", cancelVal); // disabled Index: libomptarget/deviceRTLs/nvptx/src/critical.cu =================================================================== --- libomptarget/deviceRTLs/nvptx/src/critical.cu +++ libomptarget/deviceRTLs/nvptx/src/critical.cu @@ -15,14 +15,16 @@ #include "omptarget-nvptx.h" -EXTERN void __kmpc_critical(kmp_Indent *loc, int32_t global_tid, - kmp_CriticalName *lck) { +EXTERN +void __kmpc_critical(kmp_Ident *loc, int32_t global_tid, + kmp_CriticalName *lck) { PRINT0(LD_IO, "call to kmpc_critical()\n"); omp_set_lock((omp_lock_t *)lck); } -EXTERN void __kmpc_end_critical(kmp_Indent *loc, int32_t global_tid, - kmp_CriticalName *lck) { +EXTERN +void __kmpc_end_critical(kmp_Ident *loc, int32_t global_tid, + kmp_CriticalName *lck) { PRINT0(LD_IO, "call to kmpc_end_critical()\n"); omp_unset_lock((omp_lock_t *)lck); } Index: libomptarget/deviceRTLs/nvptx/src/interface.h =================================================================== --- libomptarget/deviceRTLs/nvptx/src/interface.h +++ libomptarget/deviceRTLs/nvptx/src/interface.h @@ -160,8 +160,36 @@ } kmp_sched_t; +/*! + * Enum for accessing the reserved_2 field of the ident_t struct below. + */ +enum { + /*! Bit set to 1 when in SPMD mode. 
*/ + KMP_IDENT_SPMD_MODE = 0x01, + /*! Bit set to 1 when a simplified runtime is used. */ + KMP_IDENT_SIMPLE_RT_MODE = 0x02, +}; + +/*! + * The ident structure that describes a source location. + * The struct is identical to the one in the kmp.h file. + * We maintain the same data structure for compatibility. + */ +typedef int kmp_int32; +typedef struct ident { + kmp_int32 reserved_1; /**< might be used in Fortran; see above */ + kmp_int32 flags; /**< also f.flags; KMP_IDENT_xxx flags; KMP_IDENT_KMPC + identifies this union member */ + kmp_int32 reserved_2; /**< not really used in Fortran any more; see above */ + kmp_int32 reserved_3; /**< source[4] in Fortran, do not use for C++ */ + char const *psource; /**< String describing the source location. + The string is composed of semi-colon separated fields + which describe the source file, the function and a pair + of line numbers that delimit the construct. */ +} ident_t; + // parallel defs -typedef void kmp_Indent; +typedef ident_t kmp_Ident; typedef void (*kmp_ParFctPtr)(int32_t *global_tid, int32_t *bound_tid, ...); typedef void (*kmp_ReductFctPtr)(void *lhsData, void *rhsData); typedef void (*kmp_InterWarpCopyFctPtr)(void *src, int32_t warp_num); @@ -223,28 +251,28 @@ //////////////////////////////////////////////////////////////////////////////// // query -EXTERN int32_t __kmpc_global_num_threads(kmp_Indent *loc); // missing -EXTERN int32_t __kmpc_bound_thread_num(kmp_Indent *loc); // missing -EXTERN int32_t __kmpc_bound_num_threads(kmp_Indent *loc); // missing -EXTERN int32_t __kmpc_in_parallel(kmp_Indent *loc); // missing +EXTERN int32_t __kmpc_global_num_threads(kmp_Ident *loc); // missing +EXTERN int32_t __kmpc_bound_thread_num(kmp_Ident *loc); // missing +EXTERN int32_t __kmpc_bound_num_threads(kmp_Ident *loc); // missing +EXTERN int32_t __kmpc_in_parallel(kmp_Ident *loc); // missing // parallel -EXTERN int32_t __kmpc_global_thread_num(kmp_Indent *loc); -EXTERN void __kmpc_push_num_threads(kmp_Indent *loc, 
int32_t global_tid, +EXTERN int32_t __kmpc_global_thread_num(kmp_Ident *loc); +EXTERN void __kmpc_push_num_threads(kmp_Ident *loc, int32_t global_tid, int32_t num_threads); // simd -EXTERN void __kmpc_push_simd_limit(kmp_Indent *loc, int32_t global_tid, +EXTERN void __kmpc_push_simd_limit(kmp_Ident *loc, int32_t global_tid, int32_t simd_limit); // aee ... not supported -// EXTERN void __kmpc_fork_call(kmp_Indent *loc, int32_t argc, kmp_ParFctPtr +// EXTERN void __kmpc_fork_call(kmp_Ident *loc, int32_t argc, kmp_ParFctPtr // microtask, ...); -EXTERN void __kmpc_serialized_parallel(kmp_Indent *loc, uint32_t global_tid); -EXTERN void __kmpc_end_serialized_parallel(kmp_Indent *loc, +EXTERN void __kmpc_serialized_parallel(kmp_Ident *loc, uint32_t global_tid); +EXTERN void __kmpc_end_serialized_parallel(kmp_Ident *loc, uint32_t global_tid); -EXTERN uint16_t __kmpc_parallel_level(kmp_Indent *loc, uint32_t global_tid); +EXTERN uint16_t __kmpc_parallel_level(kmp_Ident *loc, uint32_t global_tid); // proc bind -EXTERN void __kmpc_push_proc_bind(kmp_Indent *loc, uint32_t global_tid, +EXTERN void __kmpc_push_proc_bind(kmp_Ident *loc, uint32_t global_tid, int proc_bind); EXTERN int omp_get_num_places(void); EXTERN int omp_get_place_num_procs(int place_num); @@ -254,52 +282,52 @@ EXTERN void omp_get_partition_place_nums(int *place_nums); // for static (no chunk or chunk) -EXTERN void __kmpc_for_static_init_4(kmp_Indent *loc, int32_t global_tid, +EXTERN void __kmpc_for_static_init_4(kmp_Ident *loc, int32_t global_tid, int32_t sched, int32_t *plastiter, int32_t *plower, int32_t *pupper, int32_t *pstride, int32_t incr, int32_t chunk); -EXTERN void __kmpc_for_static_init_4u(kmp_Indent *loc, int32_t global_tid, +EXTERN void __kmpc_for_static_init_4u(kmp_Ident *loc, int32_t global_tid, int32_t sched, int32_t *plastiter, uint32_t *plower, uint32_t *pupper, int32_t *pstride, int32_t incr, int32_t chunk); -EXTERN void __kmpc_for_static_init_8(kmp_Indent *loc, int32_t global_tid, +EXTERN 
void __kmpc_for_static_init_8(kmp_Ident *loc, int32_t global_tid, int32_t sched, int32_t *plastiter, int64_t *plower, int64_t *pupper, int64_t *pstride, int64_t incr, int64_t chunk); -EXTERN void __kmpc_for_static_init_8u(kmp_Indent *loc, int32_t global_tid, +EXTERN void __kmpc_for_static_init_8u(kmp_Ident *loc, int32_t global_tid, int32_t sched, int32_t *plastiter1, uint64_t *plower, uint64_t *pupper, int64_t *pstride, int64_t incr, int64_t chunk); EXTERN -void __kmpc_for_static_init_4_simple_spmd(kmp_Indent *loc, int32_t global_tid, +void __kmpc_for_static_init_4_simple_spmd(kmp_Ident *loc, int32_t global_tid, int32_t sched, int32_t *plastiter, int32_t *plower, int32_t *pupper, int32_t *pstride, int32_t incr, int32_t chunk); EXTERN -void __kmpc_for_static_init_4u_simple_spmd(kmp_Indent *loc, int32_t global_tid, +void __kmpc_for_static_init_4u_simple_spmd(kmp_Ident *loc, int32_t global_tid, int32_t sched, int32_t *plastiter, uint32_t *plower, uint32_t *pupper, int32_t *pstride, int32_t incr, int32_t chunk); EXTERN -void __kmpc_for_static_init_8_simple_spmd(kmp_Indent *loc, int32_t global_tid, +void __kmpc_for_static_init_8_simple_spmd(kmp_Ident *loc, int32_t global_tid, int32_t sched, int32_t *plastiter, int64_t *plower, int64_t *pupper, int64_t *pstride, int64_t incr, int64_t chunk); EXTERN -void __kmpc_for_static_init_8u_simple_spmd(kmp_Indent *loc, int32_t global_tid, +void __kmpc_for_static_init_8u_simple_spmd(kmp_Ident *loc, int32_t global_tid, int32_t sched, int32_t *plastiter1, uint64_t *plower, uint64_t *pupper, int64_t *pstride, int64_t incr, int64_t chunk); EXTERN -void __kmpc_for_static_init_4_simple_generic(kmp_Indent *loc, +void __kmpc_for_static_init_4_simple_generic(kmp_Ident *loc, int32_t global_tid, int32_t sched, int32_t *plastiter, int32_t *plower, int32_t *pupper, @@ -307,11 +335,11 @@ int32_t chunk); EXTERN void __kmpc_for_static_init_4u_simple_generic( - kmp_Indent *loc, int32_t global_tid, int32_t sched, int32_t *plastiter, + kmp_Ident *loc, 
int32_t global_tid, int32_t sched, int32_t *plastiter, uint32_t *plower, uint32_t *pupper, int32_t *pstride, int32_t incr, int32_t chunk); EXTERN -void __kmpc_for_static_init_8_simple_generic(kmp_Indent *loc, +void __kmpc_for_static_init_8_simple_generic(kmp_Ident *loc, int32_t global_tid, int32_t sched, int32_t *plastiter, int64_t *plower, int64_t *pupper, @@ -319,48 +347,48 @@ int64_t chunk); EXTERN void __kmpc_for_static_init_8u_simple_generic( - kmp_Indent *loc, int32_t global_tid, int32_t sched, int32_t *plastiter1, + kmp_Ident *loc, int32_t global_tid, int32_t sched, int32_t *plastiter1, uint64_t *plower, uint64_t *pupper, int64_t *pstride, int64_t incr, int64_t chunk); -EXTERN void __kmpc_for_static_fini(kmp_Indent *loc, int32_t global_tid); +EXTERN void __kmpc_for_static_fini(kmp_Ident *loc, int32_t global_tid); // for dynamic -EXTERN void __kmpc_dispatch_init_4(kmp_Indent *loc, int32_t global_tid, +EXTERN void __kmpc_dispatch_init_4(kmp_Ident *loc, int32_t global_tid, int32_t sched, int32_t lower, int32_t upper, int32_t incr, int32_t chunk); -EXTERN void __kmpc_dispatch_init_4u(kmp_Indent *loc, int32_t global_tid, +EXTERN void __kmpc_dispatch_init_4u(kmp_Ident *loc, int32_t global_tid, int32_t sched, uint32_t lower, uint32_t upper, int32_t incr, int32_t chunk); -EXTERN void __kmpc_dispatch_init_8(kmp_Indent *loc, int32_t global_tid, +EXTERN void __kmpc_dispatch_init_8(kmp_Ident *loc, int32_t global_tid, int32_t sched, int64_t lower, int64_t upper, int64_t incr, int64_t chunk); -EXTERN void __kmpc_dispatch_init_8u(kmp_Indent *loc, int32_t global_tid, +EXTERN void __kmpc_dispatch_init_8u(kmp_Ident *loc, int32_t global_tid, int32_t sched, uint64_t lower, uint64_t upper, int64_t incr, int64_t chunk); -EXTERN int __kmpc_dispatch_next_4(kmp_Indent *loc, int32_t global_tid, +EXTERN int __kmpc_dispatch_next_4(kmp_Ident *loc, int32_t global_tid, int32_t *plastiter, int32_t *plower, int32_t *pupper, int32_t *pstride); -EXTERN int __kmpc_dispatch_next_4u(kmp_Indent 
*loc, int32_t global_tid, +EXTERN int __kmpc_dispatch_next_4u(kmp_Ident *loc, int32_t global_tid, int32_t *plastiter, uint32_t *plower, uint32_t *pupper, int32_t *pstride); -EXTERN int __kmpc_dispatch_next_8(kmp_Indent *loc, int32_t global_tid, +EXTERN int __kmpc_dispatch_next_8(kmp_Ident *loc, int32_t global_tid, int32_t *plastiter, int64_t *plower, int64_t *pupper, int64_t *pstride); -EXTERN int __kmpc_dispatch_next_8u(kmp_Indent *loc, int32_t global_tid, +EXTERN int __kmpc_dispatch_next_8u(kmp_Ident *loc, int32_t global_tid, int32_t *plastiter, uint64_t *plower, uint64_t *pupper, int64_t *pstride); -EXTERN void __kmpc_dispatch_fini_4(kmp_Indent *loc, int32_t global_tid); -EXTERN void __kmpc_dispatch_fini_4u(kmp_Indent *loc, int32_t global_tid); -EXTERN void __kmpc_dispatch_fini_8(kmp_Indent *loc, int32_t global_tid); -EXTERN void __kmpc_dispatch_fini_8u(kmp_Indent *loc, int32_t global_tid); +EXTERN void __kmpc_dispatch_fini_4(kmp_Ident *loc, int32_t global_tid); +EXTERN void __kmpc_dispatch_fini_4u(kmp_Ident *loc, int32_t global_tid); +EXTERN void __kmpc_dispatch_fini_8(kmp_Ident *loc, int32_t global_tid); +EXTERN void __kmpc_dispatch_fini_8u(kmp_Ident *loc, int32_t global_tid); // Support for reducing conditional lastprivate variables -EXTERN void __kmpc_reduce_conditional_lastprivate(kmp_Indent *loc, +EXTERN void __kmpc_reduce_conditional_lastprivate(kmp_Ident *loc, int32_t global_tid, int32_t varNum, void *array); @@ -395,63 +423,63 @@ EXTERN int64_t __kmpc_shuffle_int64(int64_t val, int16_t delta, int16_t size); // sync barrier -EXTERN void __kmpc_barrier(kmp_Indent *loc_ref, int32_t tid); -EXTERN void __kmpc_barrier_simple_spmd(kmp_Indent *loc_ref, int32_t tid); -EXTERN void __kmpc_barrier_simple_generic(kmp_Indent *loc_ref, int32_t tid); -EXTERN int32_t __kmpc_cancel_barrier(kmp_Indent *loc, int32_t global_tid); +EXTERN void __kmpc_barrier(kmp_Ident *loc_ref, int32_t tid); +EXTERN void __kmpc_barrier_simple_spmd(kmp_Ident *loc_ref, int32_t tid); +EXTERN 
void __kmpc_barrier_simple_generic(kmp_Ident *loc_ref, int32_t tid); +EXTERN int32_t __kmpc_cancel_barrier(kmp_Ident *loc, int32_t global_tid); // single -EXTERN int32_t __kmpc_single(kmp_Indent *loc, int32_t global_tid); -EXTERN void __kmpc_end_single(kmp_Indent *loc, int32_t global_tid); +EXTERN int32_t __kmpc_single(kmp_Ident *loc, int32_t global_tid); +EXTERN void __kmpc_end_single(kmp_Ident *loc, int32_t global_tid); // sync -EXTERN int32_t __kmpc_master(kmp_Indent *loc, int32_t global_tid); -EXTERN void __kmpc_end_master(kmp_Indent *loc, int32_t global_tid); -EXTERN void __kmpc_ordered(kmp_Indent *loc, int32_t global_tid); -EXTERN void __kmpc_end_ordered(kmp_Indent *loc, int32_t global_tid); -EXTERN void __kmpc_critical(kmp_Indent *loc, int32_t global_tid, +EXTERN int32_t __kmpc_master(kmp_Ident *loc, int32_t global_tid); +EXTERN void __kmpc_end_master(kmp_Ident *loc, int32_t global_tid); +EXTERN void __kmpc_ordered(kmp_Ident *loc, int32_t global_tid); +EXTERN void __kmpc_end_ordered(kmp_Ident *loc, int32_t global_tid); +EXTERN void __kmpc_critical(kmp_Ident *loc, int32_t global_tid, kmp_CriticalName *crit); -EXTERN void __kmpc_end_critical(kmp_Indent *loc, int32_t global_tid, +EXTERN void __kmpc_end_critical(kmp_Ident *loc, int32_t global_tid, kmp_CriticalName *crit); -EXTERN void __kmpc_flush(kmp_Indent *loc); +EXTERN void __kmpc_flush(kmp_Ident *loc); // vote EXTERN int32_t __kmpc_warp_active_thread_mask(); // tasks -EXTERN kmp_TaskDescr *__kmpc_omp_task_alloc(kmp_Indent *loc, +EXTERN kmp_TaskDescr *__kmpc_omp_task_alloc(kmp_Ident *loc, uint32_t global_tid, int32_t flag, size_t sizeOfTaskInclPrivate, size_t sizeOfSharedTable, kmp_TaskFctPtr sub); -EXTERN int32_t __kmpc_omp_task(kmp_Indent *loc, uint32_t global_tid, +EXTERN int32_t __kmpc_omp_task(kmp_Ident *loc, uint32_t global_tid, kmp_TaskDescr *newLegacyTaskDescr); -EXTERN int32_t __kmpc_omp_task_with_deps(kmp_Indent *loc, uint32_t global_tid, +EXTERN int32_t __kmpc_omp_task_with_deps(kmp_Ident *loc, 
uint32_t global_tid, kmp_TaskDescr *newLegacyTaskDescr, int32_t depNum, void *depList, int32_t noAliasDepNum, void *noAliasDepList); -EXTERN void __kmpc_omp_task_begin_if0(kmp_Indent *loc, uint32_t global_tid, +EXTERN void __kmpc_omp_task_begin_if0(kmp_Ident *loc, uint32_t global_tid, kmp_TaskDescr *newLegacyTaskDescr); -EXTERN void __kmpc_omp_task_complete_if0(kmp_Indent *loc, uint32_t global_tid, +EXTERN void __kmpc_omp_task_complete_if0(kmp_Ident *loc, uint32_t global_tid, kmp_TaskDescr *newLegacyTaskDescr); -EXTERN void __kmpc_omp_wait_deps(kmp_Indent *loc, uint32_t global_tid, +EXTERN void __kmpc_omp_wait_deps(kmp_Ident *loc, uint32_t global_tid, int32_t depNum, void *depList, int32_t noAliasDepNum, void *noAliasDepList); -EXTERN void __kmpc_taskgroup(kmp_Indent *loc, uint32_t global_tid); -EXTERN void __kmpc_end_taskgroup(kmp_Indent *loc, uint32_t global_tid); -EXTERN int32_t __kmpc_omp_taskyield(kmp_Indent *loc, uint32_t global_tid, +EXTERN void __kmpc_taskgroup(kmp_Ident *loc, uint32_t global_tid); +EXTERN void __kmpc_end_taskgroup(kmp_Ident *loc, uint32_t global_tid); +EXTERN int32_t __kmpc_omp_taskyield(kmp_Ident *loc, uint32_t global_tid, int end_part); -EXTERN int32_t __kmpc_omp_taskwait(kmp_Indent *loc, uint32_t global_tid); -EXTERN void __kmpc_taskloop(kmp_Indent *loc, uint32_t global_tid, +EXTERN int32_t __kmpc_omp_taskwait(kmp_Ident *loc, uint32_t global_tid); +EXTERN void __kmpc_taskloop(kmp_Ident *loc, uint32_t global_tid, kmp_TaskDescr *newKmpTaskDescr, int if_val, uint64_t *lb, uint64_t *ub, int64_t st, int nogroup, int32_t sched, uint64_t grainsize, void *task_dup); // cancel -EXTERN int32_t __kmpc_cancellationpoint(kmp_Indent *loc, int32_t global_tid, +EXTERN int32_t __kmpc_cancellationpoint(kmp_Ident *loc, int32_t global_tid, int32_t cancelVal); -EXTERN int32_t __kmpc_cancel(kmp_Indent *loc, int32_t global_tid, +EXTERN int32_t __kmpc_cancel(kmp_Ident *loc, int32_t global_tid, int32_t cancelVal); // non standard Index: 
libomptarget/deviceRTLs/nvptx/src/loop.cu =================================================================== --- libomptarget/deviceRTLs/nvptx/src/loop.cu +++ libomptarget/deviceRTLs/nvptx/src/loop.cu @@ -238,10 +238,10 @@ schedule <= kmp_sched_ordered_last; } - INLINE static void dispatch_init(kmp_Indent *loc, int32_t threadId, + INLINE static void dispatch_init(kmp_Ident *loc, int32_t threadId, kmp_sched_t schedule, T lb, T ub, ST st, ST chunk) { - ASSERT0(LT_FUSSY, isRuntimeInitialized(), + ASSERT0(LT_FUSSY, checkRuntimeInitialized(loc), "Expected non-SPMD mode + initialized runtime."); int tid = GetLogicalThreadIdInBlock(); omptarget_nvptx_TaskDescr *currTaskDescr = getMyTopTaskDescriptor(tid); @@ -249,8 +249,9 @@ T tripCount = ub - lb + 1; // +1 because ub is inclusive ASSERT0( LT_FUSSY, - GetOmpThreadId(tid, isSPMDMode(), isRuntimeUninitialized()) < - GetNumberOfOmpThreads(tid, isSPMDMode(), isRuntimeUninitialized()), + GetOmpThreadId(tid, checkSPMDMode(loc), checkRuntimeUninitialized(loc)) < + GetNumberOfOmpThreads(tid, checkSPMDMode(loc), + checkRuntimeUninitialized(loc)), "current thread is not needed here; error"); /* Currently just ignore the monotonic and non-monotonic modifiers @@ -321,7 +322,8 @@ int lastiter = 0; ForStaticChunk( lastiter, lb, ub, stride, chunk, - GetOmpThreadId(tid, isSPMDMode(), isRuntimeUninitialized()), tnum); + GetOmpThreadId(tid, checkSPMDMode(loc), + checkRuntimeUninitialized(loc)), tnum); // save computed params omptarget_nvptx_threadPrivateContext->Chunk(tid) = chunk; omptarget_nvptx_threadPrivateContext->NextLowerBound(tid) = lb; @@ -329,7 +331,8 @@ PRINT(LD_LOOP, "dispatch init (static chunk) : num threads = %d, ub = %" PRId64 ", next lower bound = %llu, stride = %llu\n", - GetNumberOfOmpThreads(tid, isSPMDMode(), isRuntimeUninitialized()), + GetNumberOfOmpThreads(tid, checkSPMDMode(loc), + checkRuntimeUninitialized(loc)), omptarget_nvptx_threadPrivateContext->LoopUpperBound(tid), 
omptarget_nvptx_threadPrivateContext->NextLowerBound(tid), omptarget_nvptx_threadPrivateContext->Stride(tid)); @@ -350,7 +353,8 @@ T oldUb = ub; ForStaticChunk( lastiter, lb, ub, stride, chunk, - GetOmpThreadId(tid, isSPMDMode(), isRuntimeUninitialized()), tnum); + GetOmpThreadId(tid, checkSPMDMode(loc), + checkRuntimeUninitialized(loc)), tnum); ASSERT0(LT_FUSSY, ub >= lb, "ub must be >= lb."); if (ub > oldUb) ub = oldUb; @@ -361,7 +365,8 @@ PRINT(LD_LOOP, "dispatch init (static chunk) : num threads = %d, ub = %" PRId64 ", next lower bound = %llu, stride = %llu\n", - GetNumberOfOmpThreads(tid, isSPMDMode(), isRuntimeUninitialized()), + GetNumberOfOmpThreads(tid, checkSPMDMode(loc), + checkRuntimeUninitialized(loc)), omptarget_nvptx_threadPrivateContext->LoopUpperBound(tid), omptarget_nvptx_threadPrivateContext->NextLowerBound(tid), omptarget_nvptx_threadPrivateContext->Stride(tid)); @@ -376,7 +381,8 @@ int lastiter = 0; ForStaticNoChunk( lastiter, lb, ub, stride, chunk, - GetOmpThreadId(tid, isSPMDMode(), isRuntimeUninitialized()), tnum); + GetOmpThreadId(tid, checkSPMDMode(loc), + checkRuntimeUninitialized(loc)), tnum); // save computed params omptarget_nvptx_threadPrivateContext->Chunk(tid) = chunk; omptarget_nvptx_threadPrivateContext->NextLowerBound(tid) = lb; @@ -384,7 +390,8 @@ PRINT(LD_LOOP, "dispatch init (static nochunk) : num threads = %d, ub = %" PRId64 ", next lower bound = %llu, stride = %llu\n", - GetNumberOfOmpThreads(tid, isSPMDMode(), isRuntimeUninitialized()), + GetNumberOfOmpThreads(tid, checkSPMDMode(loc), + checkRuntimeUninitialized(loc)), omptarget_nvptx_threadPrivateContext->LoopUpperBound(tid), omptarget_nvptx_threadPrivateContext->NextLowerBound(tid), omptarget_nvptx_threadPrivateContext->Stride(tid)); @@ -405,7 +412,8 @@ PRINT(LD_LOOP, "dispatch init (dyn) : num threads = %d, lb = %llu, ub = %" PRId64 ", chunk %" PRIu64 "\n", - GetNumberOfOmpThreads(tid, isSPMDMode(), isRuntimeUninitialized()), + GetNumberOfOmpThreads(tid, 
checkSPMDMode(loc), + checkRuntimeUninitialized(loc)), omptarget_nvptx_threadPrivateContext->NextLowerBound(teamId), omptarget_nvptx_threadPrivateContext->LoopUpperBound(teamId), omptarget_nvptx_threadPrivateContext->Chunk(teamId)); @@ -538,7 +546,7 @@ //////////////////////////////////////////////////////////////////////////////// // init -EXTERN void __kmpc_dispatch_init_4(kmp_Indent *loc, int32_t tid, +EXTERN void __kmpc_dispatch_init_4(kmp_Ident *loc, int32_t tid, int32_t schedule, int32_t lb, int32_t ub, int32_t st, int32_t chunk) { PRINT0(LD_IO, "call kmpc_dispatch_init_4\n"); @@ -546,7 +554,7 @@ loc, tid, (kmp_sched_t)schedule, lb, ub, st, chunk); } -EXTERN void __kmpc_dispatch_init_4u(kmp_Indent *loc, int32_t tid, +EXTERN void __kmpc_dispatch_init_4u(kmp_Ident *loc, int32_t tid, int32_t schedule, uint32_t lb, uint32_t ub, int32_t st, int32_t chunk) { PRINT0(LD_IO, "call kmpc_dispatch_init_4u\n"); @@ -554,7 +562,7 @@ loc, tid, (kmp_sched_t)schedule, lb, ub, st, chunk); } -EXTERN void __kmpc_dispatch_init_8(kmp_Indent *loc, int32_t tid, +EXTERN void __kmpc_dispatch_init_8(kmp_Ident *loc, int32_t tid, int32_t schedule, int64_t lb, int64_t ub, int64_t st, int64_t chunk) { PRINT0(LD_IO, "call kmpc_dispatch_init_8\n"); @@ -562,7 +570,7 @@ loc, tid, (kmp_sched_t)schedule, lb, ub, st, chunk); } -EXTERN void __kmpc_dispatch_init_8u(kmp_Indent *loc, int32_t tid, +EXTERN void __kmpc_dispatch_init_8u(kmp_Ident *loc, int32_t tid, int32_t schedule, uint64_t lb, uint64_t ub, int64_t st, int64_t chunk) { PRINT0(LD_IO, "call kmpc_dispatch_init_8u\n"); @@ -571,14 +579,14 @@ } // next -EXTERN int __kmpc_dispatch_next_4(kmp_Indent *loc, int32_t tid, int32_t *p_last, +EXTERN int __kmpc_dispatch_next_4(kmp_Ident *loc, int32_t tid, int32_t *p_last, int32_t *p_lb, int32_t *p_ub, int32_t *p_st) { PRINT0(LD_IO, "call kmpc_dispatch_next_4\n"); return omptarget_nvptx_LoopSupport::dispatch_next( p_last, p_lb, p_ub, p_st); } -EXTERN int __kmpc_dispatch_next_4u(kmp_Indent *loc, int32_t 
tid, +EXTERN int __kmpc_dispatch_next_4u(kmp_Ident *loc, int32_t tid, int32_t *p_last, uint32_t *p_lb, uint32_t *p_ub, int32_t *p_st) { PRINT0(LD_IO, "call kmpc_dispatch_next_4u\n"); @@ -586,14 +594,14 @@ p_last, p_lb, p_ub, p_st); } -EXTERN int __kmpc_dispatch_next_8(kmp_Indent *loc, int32_t tid, int32_t *p_last, +EXTERN int __kmpc_dispatch_next_8(kmp_Ident *loc, int32_t tid, int32_t *p_last, int64_t *p_lb, int64_t *p_ub, int64_t *p_st) { PRINT0(LD_IO, "call kmpc_dispatch_next_8\n"); return omptarget_nvptx_LoopSupport::dispatch_next( p_last, p_lb, p_ub, p_st); } -EXTERN int __kmpc_dispatch_next_8u(kmp_Indent *loc, int32_t tid, +EXTERN int __kmpc_dispatch_next_8u(kmp_Ident *loc, int32_t tid, int32_t *p_last, uint64_t *p_lb, uint64_t *p_ub, int64_t *p_st) { PRINT0(LD_IO, "call kmpc_dispatch_next_8u\n"); @@ -602,22 +610,22 @@ } // fini -EXTERN void __kmpc_dispatch_fini_4(kmp_Indent *loc, int32_t tid) { +EXTERN void __kmpc_dispatch_fini_4(kmp_Ident *loc, int32_t tid) { PRINT0(LD_IO, "call kmpc_dispatch_fini_4\n"); omptarget_nvptx_LoopSupport::dispatch_fini(); } -EXTERN void __kmpc_dispatch_fini_4u(kmp_Indent *loc, int32_t tid) { +EXTERN void __kmpc_dispatch_fini_4u(kmp_Ident *loc, int32_t tid) { PRINT0(LD_IO, "call kmpc_dispatch_fini_4u\n"); omptarget_nvptx_LoopSupport::dispatch_fini(); } -EXTERN void __kmpc_dispatch_fini_8(kmp_Indent *loc, int32_t tid) { +EXTERN void __kmpc_dispatch_fini_8(kmp_Ident *loc, int32_t tid) { PRINT0(LD_IO, "call kmpc_dispatch_fini_8\n"); omptarget_nvptx_LoopSupport::dispatch_fini(); } -EXTERN void __kmpc_dispatch_fini_8u(kmp_Indent *loc, int32_t tid) { +EXTERN void __kmpc_dispatch_fini_8u(kmp_Ident *loc, int32_t tid) { PRINT0(LD_IO, "call kmpc_dispatch_fini_8u\n"); omptarget_nvptx_LoopSupport::dispatch_fini(); } @@ -626,52 +634,52 @@ // KMP interface implementation (static loops) //////////////////////////////////////////////////////////////////////////////// -EXTERN void __kmpc_for_static_init_4(kmp_Indent *loc, int32_t global_tid, 
+EXTERN void __kmpc_for_static_init_4(kmp_Ident *loc, int32_t global_tid, int32_t schedtype, int32_t *plastiter, int32_t *plower, int32_t *pupper, int32_t *pstride, int32_t incr, int32_t chunk) { PRINT0(LD_IO, "call kmpc_for_static_init_4\n"); omptarget_nvptx_LoopSupport::for_static_init( - schedtype, plastiter, plower, pupper, pstride, chunk, isSPMDMode(), - isRuntimeUninitialized()); + schedtype, plastiter, plower, pupper, pstride, chunk, + checkSPMDMode(loc), checkRuntimeUninitialized(loc)); } -EXTERN void __kmpc_for_static_init_4u(kmp_Indent *loc, int32_t global_tid, +EXTERN void __kmpc_for_static_init_4u(kmp_Ident *loc, int32_t global_tid, int32_t schedtype, int32_t *plastiter, uint32_t *plower, uint32_t *pupper, int32_t *pstride, int32_t incr, int32_t chunk) { PRINT0(LD_IO, "call kmpc_for_static_init_4u\n"); omptarget_nvptx_LoopSupport::for_static_init( - schedtype, plastiter, plower, pupper, pstride, chunk, isSPMDMode(), - isRuntimeUninitialized()); + schedtype, plastiter, plower, pupper, pstride, chunk, + checkSPMDMode(loc), checkRuntimeUninitialized(loc)); } -EXTERN void __kmpc_for_static_init_8(kmp_Indent *loc, int32_t global_tid, +EXTERN void __kmpc_for_static_init_8(kmp_Ident *loc, int32_t global_tid, int32_t schedtype, int32_t *plastiter, int64_t *plower, int64_t *pupper, int64_t *pstride, int64_t incr, int64_t chunk) { PRINT0(LD_IO, "call kmpc_for_static_init_8\n"); omptarget_nvptx_LoopSupport::for_static_init( - schedtype, plastiter, plower, pupper, pstride, chunk, isSPMDMode(), - isRuntimeUninitialized()); + schedtype, plastiter, plower, pupper, pstride, chunk, + checkSPMDMode(loc), checkRuntimeUninitialized(loc)); } -EXTERN void __kmpc_for_static_init_8u(kmp_Indent *loc, int32_t global_tid, +EXTERN void __kmpc_for_static_init_8u(kmp_Ident *loc, int32_t global_tid, int32_t schedtype, int32_t *plastiter, uint64_t *plower, uint64_t *pupper, int64_t *pstride, int64_t incr, int64_t chunk) { PRINT0(LD_IO, "call kmpc_for_static_init_8u\n"); 
omptarget_nvptx_LoopSupport::for_static_init( - schedtype, plastiter, plower, pupper, pstride, chunk, isSPMDMode(), - isRuntimeUninitialized()); + schedtype, plastiter, plower, pupper, pstride, chunk, + checkSPMDMode(loc), checkRuntimeUninitialized(loc)); } EXTERN -void __kmpc_for_static_init_4_simple_spmd(kmp_Indent *loc, int32_t global_tid, +void __kmpc_for_static_init_4_simple_spmd(kmp_Ident *loc, int32_t global_tid, int32_t schedtype, int32_t *plastiter, int32_t *plower, int32_t *pupper, int32_t *pstride, int32_t incr, @@ -684,7 +692,7 @@ } EXTERN -void __kmpc_for_static_init_4u_simple_spmd(kmp_Indent *loc, int32_t global_tid, +void __kmpc_for_static_init_4u_simple_spmd(kmp_Ident *loc, int32_t global_tid, int32_t schedtype, int32_t *plastiter, uint32_t *plower, uint32_t *pupper, int32_t *pstride, @@ -697,7 +705,7 @@ } EXTERN -void __kmpc_for_static_init_8_simple_spmd(kmp_Indent *loc, int32_t global_tid, +void __kmpc_for_static_init_8_simple_spmd(kmp_Ident *loc, int32_t global_tid, int32_t schedtype, int32_t *plastiter, int64_t *plower, int64_t *pupper, int64_t *pstride, int64_t incr, @@ -710,7 +718,7 @@ } EXTERN -void __kmpc_for_static_init_8u_simple_spmd(kmp_Indent *loc, int32_t global_tid, +void __kmpc_for_static_init_8u_simple_spmd(kmp_Ident *loc, int32_t global_tid, int32_t schedtype, int32_t *plastiter, uint64_t *plower, uint64_t *pupper, int64_t *pstride, @@ -724,7 +732,7 @@ EXTERN void __kmpc_for_static_init_4_simple_generic( - kmp_Indent *loc, int32_t global_tid, int32_t schedtype, int32_t *plastiter, + kmp_Ident *loc, int32_t global_tid, int32_t schedtype, int32_t *plastiter, int32_t *plower, int32_t *pupper, int32_t *pstride, int32_t incr, int32_t chunk) { PRINT0(LD_IO, "call kmpc_for_static_init_4_simple_generic\n"); @@ -736,7 +744,7 @@ EXTERN void __kmpc_for_static_init_4u_simple_generic( - kmp_Indent *loc, int32_t global_tid, int32_t schedtype, int32_t *plastiter, + kmp_Ident *loc, int32_t global_tid, int32_t schedtype, int32_t *plastiter, uint32_t 
*plower, uint32_t *pupper, int32_t *pstride, int32_t incr, int32_t chunk) { PRINT0(LD_IO, "call kmpc_for_static_init_4u_simple_generic\n"); @@ -748,7 +756,7 @@ EXTERN void __kmpc_for_static_init_8_simple_generic( - kmp_Indent *loc, int32_t global_tid, int32_t schedtype, int32_t *plastiter, + kmp_Ident *loc, int32_t global_tid, int32_t schedtype, int32_t *plastiter, int64_t *plower, int64_t *pupper, int64_t *pstride, int64_t incr, int64_t chunk) { PRINT0(LD_IO, "call kmpc_for_static_init_8_simple_generic\n"); @@ -760,7 +768,7 @@ EXTERN void __kmpc_for_static_init_8u_simple_generic( - kmp_Indent *loc, int32_t global_tid, int32_t schedtype, int32_t *plastiter, + kmp_Ident *loc, int32_t global_tid, int32_t schedtype, int32_t *plastiter, uint64_t *plower, uint64_t *pupper, int64_t *pstride, int64_t incr, int64_t chunk) { PRINT0(LD_IO, "call kmpc_for_static_init_8u_simple_generic\n"); @@ -770,7 +778,7 @@ /*IsRuntimeUninitialized=*/true); } -EXTERN void __kmpc_for_static_fini(kmp_Indent *loc, int32_t global_tid) { +EXTERN void __kmpc_for_static_fini(kmp_Ident *loc, int32_t global_tid) { PRINT0(LD_IO, "call kmpc_for_static_fini\n"); } @@ -792,17 +800,18 @@ } }; // namespace -EXTERN void __kmpc_reduce_conditional_lastprivate(kmp_Indent *loc, int32_t gtid, +EXTERN void __kmpc_reduce_conditional_lastprivate(kmp_Ident *loc, int32_t gtid, int32_t varNum, void *array) { PRINT0(LD_IO, "call to __kmpc_reduce_conditional_lastprivate(...)\n"); - ASSERT0(LT_FUSSY, isRuntimeInitialized(), + ASSERT0(LT_FUSSY, checkRuntimeInitialized(loc), "Expected non-SPMD mode + initialized runtime."); omptarget_nvptx_TeamDescr &teamDescr = getMyTeamDescriptor(); - int tid = GetOmpThreadId(GetLogicalThreadIdInBlock(), isSPMDMode(), - isRuntimeUninitialized()); + int tid = GetOmpThreadId(GetLogicalThreadIdInBlock(), checkSPMDMode(loc), + checkRuntimeUninitialized(loc)); uint32_t NumThreads = GetNumberOfOmpThreads( - GetLogicalThreadIdInBlock(), isSPMDMode(), isRuntimeUninitialized()); + 
GetLogicalThreadIdInBlock(), checkSPMDMode(loc), + checkRuntimeUninitialized(loc)); uint64_t *Buffer = teamDescr.getLastprivateIterBuffer(); for (unsigned i = 0; i < varNum; i++) { // Reset buffer. Index: libomptarget/deviceRTLs/nvptx/src/parallel.cu =================================================================== --- libomptarget/deviceRTLs/nvptx/src/parallel.cu +++ libomptarget/deviceRTLs/nvptx/src/parallel.cu @@ -332,11 +332,11 @@ // support for parallel that goes sequential //////////////////////////////////////////////////////////////////////////////// -EXTERN void __kmpc_serialized_parallel(kmp_Indent *loc, uint32_t global_tid) { +EXTERN void __kmpc_serialized_parallel(kmp_Ident *loc, uint32_t global_tid) { PRINT0(LD_IO, "call to __kmpc_serialized_parallel\n"); - if (isRuntimeUninitialized()) { - ASSERT0(LT_FUSSY, isSPMDMode(), + if (checkRuntimeUninitialized(loc)) { + ASSERT0(LT_FUSSY, checkSPMDMode(loc), "Expected SPMD mode with uninitialized runtime."); omptarget_nvptx_simpleThreadPrivateContext->IncParLevel(); return; @@ -370,12 +370,12 @@ newTaskDescr); } -EXTERN void __kmpc_end_serialized_parallel(kmp_Indent *loc, +EXTERN void __kmpc_end_serialized_parallel(kmp_Ident *loc, uint32_t global_tid) { PRINT0(LD_IO, "call to __kmpc_end_serialized_parallel\n"); - if (isRuntimeUninitialized()) { - ASSERT0(LT_FUSSY, isSPMDMode(), + if (checkRuntimeUninitialized(loc)) { + ASSERT0(LT_FUSSY, checkSPMDMode(loc), "Expected SPMD mode with uninitialized runtime."); omptarget_nvptx_simpleThreadPrivateContext->DecParLevel(); return; @@ -393,11 +393,11 @@ currTaskDescr->RestoreLoopData(); } -EXTERN uint16_t __kmpc_parallel_level(kmp_Indent *loc, uint32_t global_tid) { +EXTERN uint16_t __kmpc_parallel_level(kmp_Ident *loc, uint32_t global_tid) { PRINT0(LD_IO, "call to __kmpc_parallel_level\n"); - if (isRuntimeUninitialized()) { - ASSERT0(LT_FUSSY, isSPMDMode(), + if (checkRuntimeUninitialized(loc)) { + ASSERT0(LT_FUSSY, checkSPMDMode(loc), "Expected SPMD mode with 
uninitialized runtime."); return omptarget_nvptx_simpleThreadPrivateContext->GetParallelLevel(); } @@ -417,7 +417,7 @@ // cached by the compiler and used when calling the runtime. On nvptx // it's cheap to recalculate this value so we never use the result // of this call. -EXTERN int32_t __kmpc_global_thread_num(kmp_Indent *loc) { +EXTERN int32_t __kmpc_global_thread_num(kmp_Ident *loc) { return GetLogicalThreadIdInBlock(); } @@ -425,19 +425,19 @@ // push params //////////////////////////////////////////////////////////////////////////////// -EXTERN void __kmpc_push_num_threads(kmp_Indent *loc, int32_t tid, +EXTERN void __kmpc_push_num_threads(kmp_Ident *loc, int32_t tid, int32_t num_threads) { PRINT(LD_IO, "call kmpc_push_num_threads %d\n", num_threads); - ASSERT0(LT_FUSSY, isRuntimeInitialized(), "Runtime must be initialized."); + ASSERT0(LT_FUSSY, checkRuntimeInitialized(loc), "Runtime must be initialized."); tid = GetLogicalThreadIdInBlock(); omptarget_nvptx_threadPrivateContext->NumThreadsForNextParallel(tid) = num_threads; } -EXTERN void __kmpc_push_simd_limit(kmp_Indent *loc, int32_t tid, +EXTERN void __kmpc_push_simd_limit(kmp_Ident *loc, int32_t tid, int32_t simd_limit) { PRINT(LD_IO, "call kmpc_push_simd_limit %d\n", simd_limit); - ASSERT0(LT_FUSSY, isRuntimeInitialized(), "Runtime must be initialized."); + ASSERT0(LT_FUSSY, checkRuntimeInitialized(loc), "Runtime must be initialized."); tid = GetLogicalThreadIdInBlock(); omptarget_nvptx_threadPrivateContext->SimdLimitForNextSimd(tid) = simd_limit; } @@ -445,14 +445,14 @@ // Do nothing. The host guarantees we started the requested number of // teams and we only need inspection of gridDim. 
-EXTERN void __kmpc_push_num_teams(kmp_Indent *loc, int32_t tid, +EXTERN void __kmpc_push_num_teams(kmp_Ident *loc, int32_t tid, int32_t num_teams, int32_t thread_limit) { PRINT(LD_IO, "call kmpc_push_num_teams %d\n", num_teams); ASSERT0(LT_FUSSY, FALSE, "should never have anything with new teams on device"); } -EXTERN void __kmpc_push_proc_bind(kmp_Indent *loc, uint32_t tid, +EXTERN void __kmpc_push_proc_bind(kmp_Ident *loc, uint32_t tid, int proc_bind) { PRINT(LD_IO, "call kmpc_push_proc_bind %d\n", proc_bind); } Index: libomptarget/deviceRTLs/nvptx/src/reduction.cu =================================================================== --- libomptarget/deviceRTLs/nvptx/src/reduction.cu +++ libomptarget/deviceRTLs/nvptx/src/reduction.cu @@ -31,7 +31,7 @@ } EXTERN -int32_t __kmpc_reduce_gpu(kmp_Indent *loc, int32_t global_tid, int32_t num_vars, +int32_t __kmpc_reduce_gpu(kmp_Ident *loc, int32_t global_tid, int32_t num_vars, size_t reduce_size, void *reduce_data, void *reduce_array_size, kmp_ReductFctPtr *reductFct, kmp_CriticalName *lck) { @@ -40,7 +40,8 @@ int numthread; if (currTaskDescr->IsParallelConstruct()) { numthread = - GetNumberOfOmpThreads(threadId, isSPMDMode(), isRuntimeUninitialized()); + GetNumberOfOmpThreads(threadId, checkSPMDMode(loc), + checkRuntimeUninitialized(loc)); } else { numthread = GetNumberOfOmpTeams(); } @@ -55,12 +56,12 @@ } EXTERN -int32_t __kmpc_reduce_combined(kmp_Indent *loc) { +int32_t __kmpc_reduce_combined(kmp_Ident *loc) { return threadIdx.x == 0 ? 2 : 0; } EXTERN -int32_t __kmpc_reduce_simd(kmp_Indent *loc) { +int32_t __kmpc_reduce_simd(kmp_Ident *loc) { return (threadIdx.x % 32 == 0) ? 
1 : 0; } Index: libomptarget/deviceRTLs/nvptx/src/supporti.h =================================================================== --- libomptarget/deviceRTLs/nvptx/src/supporti.h +++ libomptarget/deviceRTLs/nvptx/src/supporti.h @@ -33,6 +33,59 @@ } //////////////////////////////////////////////////////////////////////////////// +// Execution Modes based on location parameter fields +//////////////////////////////////////////////////////////////////////////////// + +INLINE bool checkSPMDMode(kmp_Ident *loc) { + if (!loc) + return isSPMDMode(); + + // If SPMD is true then we are not in the UNDEFINED state so + // we can return immediately. + if (loc->reserved_2 & KMP_IDENT_SPMD_MODE) + return true; + + // If not in SPMD mode and the runtime is required, this is a valid + // combination of flags so we can return immediately. + if (!(loc->reserved_2 & KMP_IDENT_SIMPLE_RT_MODE)) + return false; + + // We are in the UNDEFINED state. + return isSPMDMode(); +} + +INLINE bool checkGenericMode(kmp_Ident *loc) { + return !checkSPMDMode(loc); +} + +INLINE bool checkRuntimeUninitialized(kmp_Ident *loc) { + if (!loc) + return isRuntimeUninitialized(); + + // If runtime is required then we know we can't be + // in the undefined mode. We can return immediately. + if (!(loc->reserved_2 & KMP_IDENT_SIMPLE_RT_MODE)) + return false; + + // If runtime is not required then we need to check if we are in + // SPMD mode or not. If not in SPMD mode then we end + // up in the UNDEFINED state that marks the orphaned + // functions. + if (loc->reserved_2 & KMP_IDENT_SPMD_MODE) + return true; + + // Check if we are in an UNDEFINED state. Undefined is denoted by + // non-SPMD + noRuntimeRequired which is a combination that + // cannot actually happen. The undefined state is used to mark orphaned + // functions. 
+ return isRuntimeUninitialized(); +} + +INLINE bool checkRuntimeInitialized(kmp_Ident *loc) { + return !checkRuntimeUninitialized(loc); +} + +//////////////////////////////////////////////////////////////////////////////// // support: get info from machine //////////////////////////////////////////////////////////////////////////////// @@ -78,8 +131,6 @@ // id is GetMasterThreadID()) calls this routine, we return 0 because // it is a shadow for the first worker. INLINE int GetLogicalThreadIdInBlock() { - // return GetThreadIdInBlock() % GetMasterThreadID(); - // Implemented using control flow (predication) instead of with a modulo // operation. int tid = GetThreadIdInBlock(); Index: libomptarget/deviceRTLs/nvptx/src/sync.cu =================================================================== --- libomptarget/deviceRTLs/nvptx/src/sync.cu +++ libomptarget/deviceRTLs/nvptx/src/sync.cu @@ -17,11 +17,11 @@ // KMP Ordered calls //////////////////////////////////////////////////////////////////////////////// -EXTERN void __kmpc_ordered(kmp_Indent *loc, int32_t tid) { +EXTERN void __kmpc_ordered(kmp_Ident *loc, int32_t tid) { PRINT0(LD_IO, "call kmpc_ordered\n"); } -EXTERN void __kmpc_end_ordered(kmp_Indent *loc, int32_t tid) { +EXTERN void __kmpc_end_ordered(kmp_Ident *loc, int32_t tid) { PRINT0(LD_IO, "call kmpc_end_ordered\n"); } @@ -33,16 +33,16 @@ // FIXME: what if not all threads (warps) participate to the barrier? 
// We may need to implement it differently -EXTERN int32_t __kmpc_cancel_barrier(kmp_Indent *loc_ref, int32_t tid) { +EXTERN int32_t __kmpc_cancel_barrier(kmp_Ident *loc_ref, int32_t tid) { PRINT0(LD_IO, "call kmpc_cancel_barrier\n"); __kmpc_barrier(loc_ref, tid); PRINT0(LD_SYNC, "completed kmpc_cancel_barrier\n"); return 0; } -EXTERN void __kmpc_barrier(kmp_Indent *loc_ref, int32_t tid) { - if (isRuntimeUninitialized()) { - ASSERT0(LT_FUSSY, isSPMDMode(), +EXTERN void __kmpc_barrier(kmp_Ident *loc_ref, int32_t tid) { + if (checkRuntimeUninitialized(loc_ref)) { + ASSERT0(LT_FUSSY, checkSPMDMode(loc_ref), "Expected SPMD mode with uninitialized runtime."); __kmpc_barrier_simple_spmd(loc_ref, tid); } else { @@ -50,9 +50,9 @@ omptarget_nvptx_TaskDescr *currTaskDescr = omptarget_nvptx_threadPrivateContext->GetTopLevelTaskDescr(tid); int numberOfActiveOMPThreads = GetNumberOfOmpThreads( - tid, isSPMDMode(), /*isRuntimeUninitialized=*/false); + tid, checkSPMDMode(loc_ref), /*isRuntimeUninitialized=*/false); if (numberOfActiveOMPThreads > 1) { - if (isSPMDMode()) { + if (checkSPMDMode(loc_ref)) { __kmpc_barrier_simple_spmd(loc_ref, tid); } else { // The #threads parameter must be rounded up to the WARPSIZE. @@ -72,7 +72,7 @@ // Emit a simple barrier call in SPMD mode. Assumes the caller is in an L0 // parallel region and that all worker threads participate. -EXTERN void __kmpc_barrier_simple_spmd(kmp_Indent *loc_ref, int32_t tid) { +EXTERN void __kmpc_barrier_simple_spmd(kmp_Ident *loc_ref, int32_t tid) { PRINT0(LD_SYNC, "call kmpc_barrier_simple_spmd\n"); __syncthreads(); PRINT0(LD_SYNC, "completed kmpc_barrier_simple_spmd\n"); @@ -80,7 +80,7 @@ // Emit a simple barrier call in Generic mode. Assumes the caller is in an L0 // parallel region and that all worker threads participate. 
-EXTERN void __kmpc_barrier_simple_generic(kmp_Indent *loc_ref, int32_t tid) { +EXTERN void __kmpc_barrier_simple_generic(kmp_Ident *loc_ref, int32_t tid) { int numberOfActiveOMPThreads = GetNumberOfThreadsInBlock() - WARPSIZE; // The #threads parameter must be rounded up to the WARPSIZE. int threads = @@ -106,12 +106,12 @@ return IsTeamMaster(ompThreadId); } -EXTERN int32_t __kmpc_master(kmp_Indent *loc, int32_t global_tid) { +EXTERN int32_t __kmpc_master(kmp_Ident *loc, int32_t global_tid) { PRINT0(LD_IO, "call kmpc_master\n"); return IsMaster(); } -EXTERN void __kmpc_end_master(kmp_Indent *loc, int32_t global_tid) { +EXTERN void __kmpc_end_master(kmp_Ident *loc, int32_t global_tid) { PRINT0(LD_IO, "call kmpc_end_master\n"); ASSERT0(LT_FUSSY, IsMaster(), "expected only master here"); } @@ -120,13 +120,13 @@ // KMP SINGLE //////////////////////////////////////////////////////////////////////////////// -EXTERN int32_t __kmpc_single(kmp_Indent *loc, int32_t global_tid) { +EXTERN int32_t __kmpc_single(kmp_Ident *loc, int32_t global_tid) { PRINT0(LD_IO, "call kmpc_single\n"); // decide to implement single with master; master get the single return IsMaster(); } -EXTERN void __kmpc_end_single(kmp_Indent *loc, int32_t global_tid) { +EXTERN void __kmpc_end_single(kmp_Ident *loc, int32_t global_tid) { PRINT0(LD_IO, "call kmpc_end_single\n"); // decide to implement single with master: master get the single ASSERT0(LT_FUSSY, IsMaster(), "expected only master here"); @@ -137,7 +137,7 @@ // Flush //////////////////////////////////////////////////////////////////////////////// -EXTERN void __kmpc_flush(kmp_Indent *loc) { +EXTERN void __kmpc_flush(kmp_Ident *loc) { PRINT0(LD_IO, "call kmpc_flush\n"); __threadfence_block(); } Index: libomptarget/deviceRTLs/nvptx/src/task.cu =================================================================== --- libomptarget/deviceRTLs/nvptx/src/task.cu +++ libomptarget/deviceRTLs/nvptx/src/task.cu @@ -31,7 +31,7 @@ #include "omptarget-nvptx.h" 
EXTERN kmp_TaskDescr *__kmpc_omp_task_alloc( - kmp_Indent *loc, // unused + kmp_Ident *loc, // unused uint32_t global_tid, // unused int32_t flag, // unused (because in our impl, all are immediately exec size_t sizeOfTaskInclPrivate, size_t sizeOfSharedTable, @@ -68,20 +68,20 @@ return newKmpTaskDescr; } -EXTERN int32_t __kmpc_omp_task(kmp_Indent *loc, uint32_t global_tid, +EXTERN int32_t __kmpc_omp_task(kmp_Ident *loc, uint32_t global_tid, kmp_TaskDescr *newKmpTaskDescr) { return __kmpc_omp_task_with_deps(loc, global_tid, newKmpTaskDescr, 0, 0, 0, 0); } -EXTERN int32_t __kmpc_omp_task_with_deps(kmp_Indent *loc, uint32_t global_tid, +EXTERN int32_t __kmpc_omp_task_with_deps(kmp_Ident *loc, uint32_t global_tid, kmp_TaskDescr *newKmpTaskDescr, int32_t depNum, void *depList, int32_t noAliasDepNum, void *noAliasDepList) { PRINT(LD_IO, "call to __kmpc_omp_task_with_deps(task 0x%llx)\n", P64(newKmpTaskDescr)); - ASSERT0(LT_FUSSY, isRuntimeInitialized(), "Runtime must be initialized."); + ASSERT0(LT_FUSSY, checkRuntimeInitialized(loc), "Runtime must be initialized."); // 1. get explict task descr from kmp task descr omptarget_nvptx_ExplicitTaskDescr *newExplicitTaskDescr = (omptarget_nvptx_ExplicitTaskDescr *)SUB_BYTES( @@ -114,11 +114,11 @@ return 0; } -EXTERN void __kmpc_omp_task_begin_if0(kmp_Indent *loc, uint32_t global_tid, +EXTERN void __kmpc_omp_task_begin_if0(kmp_Ident *loc, uint32_t global_tid, kmp_TaskDescr *newKmpTaskDescr) { PRINT(LD_IO, "call to __kmpc_omp_task_begin_if0(task 0x%llx)\n", P64(newKmpTaskDescr)); - ASSERT0(LT_FUSSY, isRuntimeInitialized(), "Runtime must be initialized."); + ASSERT0(LT_FUSSY, checkRuntimeInitialized(loc), "Runtime must be initialized."); // 1. get explict task descr from kmp task descr omptarget_nvptx_ExplicitTaskDescr *newExplicitTaskDescr = (omptarget_nvptx_ExplicitTaskDescr *)SUB_BYTES( @@ -139,11 +139,11 @@ // 4 & 5 ... 
done in complete } -EXTERN void __kmpc_omp_task_complete_if0(kmp_Indent *loc, uint32_t global_tid, +EXTERN void __kmpc_omp_task_complete_if0(kmp_Ident *loc, uint32_t global_tid, kmp_TaskDescr *newKmpTaskDescr) { PRINT(LD_IO, "call to __kmpc_omp_task_complete_if0(task 0x%llx)\n", P64(newKmpTaskDescr)); - ASSERT0(LT_FUSSY, isRuntimeInitialized(), "Runtime must be initialized."); + ASSERT0(LT_FUSSY, checkRuntimeInitialized(loc), "Runtime must be initialized."); // 1. get explict task descr from kmp task descr omptarget_nvptx_ExplicitTaskDescr *newExplicitTaskDescr = (omptarget_nvptx_ExplicitTaskDescr *)SUB_BYTES( @@ -164,37 +164,37 @@ SafeFree(newExplicitTaskDescr, "explicit task descriptor"); } -EXTERN void __kmpc_omp_wait_deps(kmp_Indent *loc, uint32_t global_tid, +EXTERN void __kmpc_omp_wait_deps(kmp_Ident *loc, uint32_t global_tid, int32_t depNum, void *depList, int32_t noAliasDepNum, void *noAliasDepList) { PRINT0(LD_IO, "call to __kmpc_omp_wait_deps(..)\n"); // nothing to do as all our tasks are executed as final } -EXTERN void __kmpc_taskgroup(kmp_Indent *loc, uint32_t global_tid) { +EXTERN void __kmpc_taskgroup(kmp_Ident *loc, uint32_t global_tid) { PRINT0(LD_IO, "call to __kmpc_taskgroup(..)\n"); // nothing to do as all our tasks are executed as final } -EXTERN void __kmpc_end_taskgroup(kmp_Indent *loc, uint32_t global_tid) { +EXTERN void __kmpc_end_taskgroup(kmp_Ident *loc, uint32_t global_tid) { PRINT0(LD_IO, "call to __kmpc_end_taskgroup(..)\n"); // nothing to do as all our tasks are executed as final } -EXTERN int32_t __kmpc_omp_taskyield(kmp_Indent *loc, uint32_t global_tid, +EXTERN int32_t __kmpc_omp_taskyield(kmp_Ident *loc, uint32_t global_tid, int end_part) { PRINT0(LD_IO, "call to __kmpc_taskyield()\n"); // do nothing: tasks are executed immediately, no yielding allowed return 0; } -EXTERN int32_t __kmpc_omp_taskwait(kmp_Indent *loc, uint32_t global_tid) { +EXTERN int32_t __kmpc_omp_taskwait(kmp_Ident *loc, uint32_t global_tid) { PRINT0(LD_IO, "call 
to __kmpc_taskwait()\n"); // nothing to do as all our tasks are executed as final return 0; } -EXTERN void __kmpc_taskloop(kmp_Indent *loc, uint32_t global_tid, +EXTERN void __kmpc_taskloop(kmp_Ident *loc, uint32_t global_tid, kmp_TaskDescr *newKmpTaskDescr, int if_val, uint64_t *lb, uint64_t *ub, int64_t st, int nogroup, int32_t sched, uint64_t grainsize, void *task_dup) {