diff --git a/openmp/runtime/CMakeLists.txt b/openmp/runtime/CMakeLists.txt
--- a/openmp/runtime/CMakeLists.txt
+++ b/openmp/runtime/CMakeLists.txt
@@ -288,6 +288,13 @@
   set (LIBOMP_USE_VERSION_SYMBOLS FALSE)
 endif()
 
+# Unshackled task support defaults to OFF
+set(LIBOMP_USE_UNSHACKLED_TASK FALSE CACHE BOOL "Use unshackled task?")
+if(WIN32)
+  message(STATUS "Unshackled task is not yet supported on Windows - forcing off")
+  set(LIBOMP_USE_UNSHACKLED_TASK FALSE)
+endif()
+
 # OMPT-support defaults to ON for OpenMP 5.0+ and if the requirements in
 # cmake/config-ix.cmake are fulfilled.
 set(OMPT_DEFAULT FALSE)
diff --git a/openmp/runtime/src/kmp.h b/openmp/runtime/src/kmp.h
--- a/openmp/runtime/src/kmp.h
+++ b/openmp/runtime/src/kmp.h
@@ -2236,7 +2236,12 @@
   unsigned priority_specified : 1; /* set if the compiler provides priority
                                       setting for the task */
   unsigned detachable : 1; /* 1 == can detach */
+#if USE_UNSHACKLED_TASK
+  unsigned unshackled : 1; /* 1 == unshackled task */
+  unsigned reserved : 8; /* reserved for compiler use */
+#else
   unsigned reserved : 9; /* reserved for compiler use */
+#endif
 
   /* Library flags */ /* Total library flags must be 16 bits */
   unsigned tasktype : 1; /* task is either explicit(1) or implicit (0) */
@@ -2284,6 +2289,23 @@
   kmp_depnode_t
       *td_depnode; // Pointer to graph node if this task has dependencies
   kmp_task_team_t *td_task_team;
+#if USE_UNSHACKLED_TASK
+  // The task team of the parent task. Usually we could access it via
+  // parent_task->td_task_team, but it is possible that
+  // parent_task->td_task_team is nullptr because of late initialization.
+  // Sometimes we must use this pointer anyway, and since the td_task_team of
+  // the encountering thread is never nullptr, we record it here when the task
+  // is created.
+  kmp_task_team_t *td_parent_task_team;
+  // The global thread id of the encountering thread. We need it because when
+  // a regular task depends on an unshackled task, and the unshackled task
+  // finishes on an unshackled thread, it calls __kmp_release_deps to release
+  // all dependences. If the successor is a regular task, we need to pass the
+  // encountering gtid so that the successor is picked up and executed by its
+  // encountering team instead of the unshackled team.
+  kmp_int32 encountering_gtid;
+#endif
   kmp_int32 td_size_alloc; // The size of task structure, including shareds etc.
 #if defined(KMP_GOMP_COMPAT)
   // 4 or 8 byte integers for the loop bounds in GOMP_taskloop
@@ -2351,10 +2373,20 @@
   kmp_int32 tt_max_threads; // # entries allocated for threads_data array
   kmp_int32 tt_found_proxy_tasks; // found proxy tasks since last barrier
   kmp_int32 tt_untied_task_encountered;
+#if USE_UNSHACKLED_TASK
+  // An unshackled task was encountered in this task team, so we must wait
+  // when waiting on this task team
+  kmp_int32 tt_unshackled_task_encountered;
+#endif
 
   KMP_ALIGN_CACHE
   std::atomic<kmp_int32> tt_unfinished_threads; /* #threads still active */
 
+#if USE_UNSHACKLED_TASK
+  KMP_ALIGN_CACHE
+  std::atomic<kmp_int32> tt_unfinished_unshackled_tasks;
+#endif
+
   KMP_ALIGN_CACHE
   volatile kmp_uint32 tt_active; /* is the team still actively executing tasks */
@@ -2819,6 +2851,9 @@
 extern volatile int __kmp_init_monitor;
 #endif
 extern volatile int __kmp_init_user_locks;
+#if USE_UNSHACKLED_TASK
+extern volatile int __kmp_init_unshackled_threads;
+#endif
 extern int __kmp_init_counter;
 extern int __kmp_root_counter;
 extern int __kmp_version;
@@ -3905,6 +3940,40 @@
 extern void __kmp_omp_display_env(int verbose);
 
+#if USE_UNSHACKLED_TASK
+// 1: the unshackled team is being initialized
+extern volatile int __kmp_init_unshackled;
+// Master thread of the unshackled team
+extern kmp_info_t *__kmp_unshackled_master_thread;
+// Descriptors for the unshackled threads
+extern kmp_info_t **__kmp_unshackled_threads;
+// Number of unshackled threads
+extern int __kmp_unshackled_threads_num;
+// Number of unshackled tasks that have not been executed yet
+extern std::atomic<kmp_int32> __kmp_unexecuted_unshackled_tasks;
+
+extern void __kmp_unshackled_initialize();
+extern void __kmp_unshackled_threads_initz_routine();
+extern void __kmp_do_initialize_unshackled_threads();
+extern void __kmp_unshackled_threads_initz_wait();
+extern void __kmp_unshackled_initz_release();
+extern void __kmp_unshackled_master_thread_wait();
+extern void __kmp_unshackled_worker_thread_wait();
+extern void __kmp_unshackled_worker_thread_signal();
+
+// Check whether a given thread is an unshackled thread
+#define KMP_UNSHACKLED_THREAD(gtid)                                            \
+  ((gtid) >= 1 && (gtid) <= __kmp_unshackled_threads_num)
+
+// Check whether a given thread is an unshackled worker thread, i.e. any
+// unshackled thread except their master
+#define KMP_UNSHACKLED_WORKER_THREAD(gtid)                                     \
+  ((gtid) > 1 && (gtid) <= __kmp_unshackled_threads_num)
+
+// Map a gtid to an unshackled thread. The first unshackled thread, a.k.a. the
+// master thread, is skipped.
+#define KMP_GTID_TO_SHADOW_GTID(gtid)                                         \
+  ((gtid) % (__kmp_unshackled_threads_num - 1) + 2)
+#endif
+
 #ifdef __cplusplus
 }
 #endif
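
As a concrete illustration of the gtid layout described in the comments above (an editorial sketch, not part of the patch): slot 0 holds the initial thread, slots 1 through __kmp_unshackled_threads_num hold the unshackled threads (slot 1 being their master), and KMP_GTID_TO_SHADOW_GTID cycles a regular thread's pushes over the worker slots 2 through __kmp_unshackled_threads_num. The constant 8 mirrors the default the patch hard-codes in kmp_runtime.cpp.

#include <cassert>
#include <iostream>

// Mirrors the runtime's current default; the real value lives in
// __kmp_unshackled_threads_num.
static const int unshackled_threads_num = 8;

// Mirrors KMP_GTID_TO_SHADOW_GTID: skip gtid 0 (initial thread) and gtid 1
// (unshackled master), cycling over the worker slots [2, 8].
static int gtid_to_shadow_gtid(int gtid) {
  return gtid % (unshackled_threads_num - 1) + 2;
}

int main() {
  // Regular OpenMP threads occupy gtids above the unshackled slots.
  for (int gtid = unshackled_threads_num + 1; gtid < 16; ++gtid) {
    int shadow = gtid_to_shadow_gtid(gtid);
    assert(shadow >= 2 && shadow <= unshackled_threads_num);
    std::cout << "regular gtid " << gtid << " -> shadow gtid " << shadow
              << "\n";
  }
  return 0;
}
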
diff --git a/openmp/runtime/src/kmp_config.h.cmake b/openmp/runtime/src/kmp_config.h.cmake
--- a/openmp/runtime/src/kmp_config.h.cmake
+++ b/openmp/runtime/src/kmp_config.h.cmake
@@ -44,6 +44,8 @@
 #define OMPT_DEBUG LIBOMP_OMPT_DEBUG
 #cmakedefine01 LIBOMP_OMPT_SUPPORT
 #define OMPT_SUPPORT LIBOMP_OMPT_SUPPORT
+#cmakedefine01 LIBOMP_USE_UNSHACKLED_TASK
+#define USE_UNSHACKLED_TASK LIBOMP_USE_UNSHACKLED_TASK
 #cmakedefine01 LIBOMP_OMPT_OPTIONAL
 #define OMPT_OPTIONAL LIBOMP_OMPT_OPTIONAL
 #cmakedefine01 LIBOMP_USE_ADAPTIVE_LOCKS
diff --git a/openmp/runtime/src/kmp_global.cpp b/openmp/runtime/src/kmp_global.cpp
--- a/openmp/runtime/src/kmp_global.cpp
+++ b/openmp/runtime/src/kmp_global.cpp
@@ -46,11 +46,17 @@
 volatile int __kmp_init_common = FALSE;
 volatile int __kmp_init_middle = FALSE;
 volatile int __kmp_init_parallel = FALSE;
+#if USE_UNSHACKLED_TASK
+volatile int __kmp_init_unshackled = FALSE;
+#endif
 #if KMP_USE_MONITOR
 volatile int __kmp_init_monitor =
     0; /* 1 - launched, 2 - actually started (Windows* OS only) */
 #endif
 volatile int __kmp_init_user_locks = FALSE;
+#if USE_UNSHACKLED_TASK
+volatile int __kmp_init_unshackled_threads = FALSE;
+#endif
 /* list of address of allocated caches for commons */
 kmp_cached_addr_t *__kmp_threadpriv_cache_list = NULL;
diff --git a/openmp/runtime/src/kmp_runtime.cpp b/openmp/runtime/src/kmp_runtime.cpp
--- a/openmp/runtime/src/kmp_runtime.cpp
+++ b/openmp/runtime/src/kmp_runtime.cpp
@@ -3630,6 +3630,39 @@
     }
   }
 
+#if USE_UNSHACKLED_TASK
+  // When unshackled task is enabled, __kmp_threads is organized as follows:
+  // 0: initial thread, also a regular OpenMP thread.
+  // [1, __kmp_unshackled_threads_num]: slots for unshackled threads.
+  // [__kmp_unshackled_threads_num + 1, __kmp_threads_capacity): slots for
+  // regular OpenMP threads.
+  if (TCR_4(__kmp_init_unshackled_threads)) {
+    // Find an available thread slot for an unshackled thread. Slots for
+    // unshackled threads run from 1 to __kmp_unshackled_threads_num.
+    for (gtid = 1; TCR_PTR(__kmp_threads[gtid]) != NULL &&
+                   gtid <= __kmp_unshackled_threads_num;
+         gtid++)
+      ;
+    KMP_ASSERT(gtid <= __kmp_unshackled_threads_num);
+    KA_TRACE(1, ("__kmp_register_root: found slot in threads array for "
+                 "unshackled thread: T#%d\n",
+                 gtid));
+  } else {
+    /* find an available thread slot */
+    // Don't reassign the zero slot since we need that to only be used by the
+    // initial thread. Slots for unshackled threads should also be skipped.
+    if (initial_thread && __kmp_threads[0] == NULL) {
+      gtid = 0;
+    } else {
+      for (gtid = __kmp_unshackled_threads_num + 1;
+           TCR_PTR(__kmp_threads[gtid]) != NULL; gtid++)
+        ;
+    }
+    KA_TRACE(
+        1, ("__kmp_register_root: found slot in threads array: T#%d\n", gtid));
+    KMP_ASSERT(gtid < __kmp_threads_capacity);
+  }
+#else
   /* find an available thread slot */
   /* Don't reassign the zero slot since we need that to only be used by
      initial thread */
@@ -3639,6 +3672,7 @@
   KA_TRACE(1,
            ("__kmp_register_root: found slot in threads array: T#%d\n", gtid));
   KMP_ASSERT(gtid < __kmp_threads_capacity);
+#endif
 
   /* update global accounting */
   __kmp_all_nth++;
@@ -4295,9 +4329,22 @@
 #endif
   KMP_MB();
 
+#if USE_UNSHACKLED_TASK
+  for (new_gtid = TCR_4(__kmp_init_unshackled_threads)
+                      ? 1
+                      : __kmp_unshackled_threads_num + 1;
+       TCR_PTR(__kmp_threads[new_gtid]) != NULL; ++new_gtid) {
+    KMP_DEBUG_ASSERT(new_gtid < __kmp_threads_capacity);
+  }
+  if (TCR_4(__kmp_init_unshackled_threads)) {
+    KMP_DEBUG_ASSERT(new_gtid <= __kmp_unshackled_threads_num);
+  }
+#else
   for (new_gtid = 1; TCR_PTR(__kmp_threads[new_gtid]) != NULL; ++new_gtid) {
     KMP_DEBUG_ASSERT(new_gtid < __kmp_threads_capacity);
   }
+#endif
 
   /* allocate space for it. */
   new_thr = (kmp_info_t *)__kmp_allocate(sizeof(kmp_info_t));
@@ -6693,6 +6740,11 @@
   __kmp_env_free(&val);
 #endif
 
+#if USE_UNSHACKLED_TASK
+  // TODO: Initialize this value from an environment variable
+  __kmp_unshackled_threads_num = 8;
+#endif
+
   __kmp_threads_capacity =
       __kmp_initial_threads_capacity(__kmp_dflt_team_nth_ub);
   // Moved here from __kmp_env_initialize() "KMP_ALL_THREADPRIVATE" part
@@ -6992,6 +7044,45 @@
   __kmp_release_bootstrap_lock(&__kmp_initz_lock);
 }
 
+#if USE_UNSHACKLED_TASK
+void __kmp_unshackled_initialize() {
+  if (TCR_4(__kmp_init_unshackled))
+    return;
+
+  // __kmp_parallel_initialize is required before we initialize unshackled
+  if (!TCR_4(__kmp_init_parallel))
+    __kmp_parallel_initialize();
+
+  // Double check. Note that this double check should not be placed before
+  // __kmp_parallel_initialize, as that would cause a deadlock.
+  __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
+  if (TCR_4(__kmp_init_unshackled)) {
+    __kmp_release_bootstrap_lock(&__kmp_initz_lock);
+    return;
+  }
+
+  // Set the count of unshackled tasks to be executed to zero
+  KMP_ATOMIC_ST_REL(&__kmp_unexecuted_unshackled_tasks, 0);
+
+  // Set the global variable indicating that we're initializing the unshackled
+  // team/threads
+  TCW_SYNC_4(__kmp_init_unshackled_threads, TRUE);
+
+  // Platform-independent initialization
+  __kmp_do_initialize_unshackled_threads();
+
+  // Wait here until the unshackled team has finished its initialization
+  if (TCR_4(__kmp_init_parallel)) {
+    __kmp_unshackled_threads_initz_wait();
+  }
+
+  // We have finished unshackled initialization
+  TCW_SYNC_4(__kmp_init_unshackled, TRUE);
+
+  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
+}
+#endif
+
 /* ------------------------------------------------------------------------ */
 
 void __kmp_run_before_invoked_task(int gtid, int tid, kmp_info_t *this_thr,
@@ -8336,7 +8427,6 @@
   }
 }
 
-
 void __kmp_omp_display_env(int verbose) {
   __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
   if (__kmp_init_serial == 0)
@@ -8344,3 +8434,47 @@
   __kmp_display_env_impl(!verbose, verbose);
   __kmp_release_bootstrap_lock(&__kmp_initz_lock);
 }
+
+#if USE_UNSHACKLED_TASK
+kmp_info_t **__kmp_unshackled_threads;
+kmp_info_t *__kmp_unshackled_master_thread;
+int __kmp_unshackled_threads_num;
+std::atomic<kmp_int32> __kmp_unexecuted_unshackled_tasks;
+
+namespace {
+std::atomic<kmp_int32> __kmp_hit_unshackled_threads_num;
+
+void __kmp_unshackled_wrapper_fn(int *gtid, int *, ...) {
+  // This is an explicit synchronization of all unshackled threads. It is
+  // needed in case a regular thread pushes an unshackled task to an
+  // unshackled thread that has not been awakened even once since the master
+  // thread released the workers after creating the team.
+  KMP_ATOMIC_INC(&__kmp_hit_unshackled_threads_num);
+  while (KMP_ATOMIC_LD_ACQ(&__kmp_hit_unshackled_threads_num) !=
+         __kmp_unshackled_threads_num)
+    ;
+
+  // If master thread, then wait for the signal
+  if (__kmpc_master(nullptr, *gtid)) {
+    // First, unset the initial state and release the initial thread
+    TCW_4(__kmp_init_unshackled_threads, FALSE);
+    __kmp_unshackled_initz_release();
+    __kmp_unshackled_master_thread_wait();
+  }
+}
+} // namespace
+
+void __kmp_unshackled_threads_initz_routine() {
+  // Create a new root for the unshackled team/threads
+  const int gtid = __kmp_register_root(TRUE);
+  __kmp_unshackled_master_thread = __kmp_threads[gtid];
+  __kmp_unshackled_threads = &__kmp_threads[gtid];
+  __kmp_unshackled_master_thread->th.th_set_nproc =
+      __kmp_unshackled_threads_num;
+
+  KMP_ATOMIC_ST_REL(&__kmp_hit_unshackled_threads_num, 0);
+
+  __kmpc_fork_call(nullptr, 0, __kmp_unshackled_wrapper_fn);
+}
+
+#endif
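
__kmp_unshackled_initialize above follows the classic double-checked initialization pattern: an unlocked fast-path check, then the bootstrap lock, then a re-check before doing the real work. A minimal standalone sketch of that pattern (an editorial illustration, with std::mutex and std::atomic standing in for the runtime's bootstrap lock and TCR/TCW macros):

#include <atomic>
#include <mutex>

// Stand-ins for __kmp_init_unshackled and __kmp_initz_lock.
static std::atomic<bool> initialized{false};
static std::mutex init_lock;

static void do_platform_init() { /* create threads, semaphores, etc. */ }

void ensure_initialized() {
  // Fast path: already initialized, no lock taken.
  if (initialized.load(std::memory_order_acquire))
    return;
  std::lock_guard<std::mutex> guard(init_lock);
  // Re-check under the lock: another thread may have won the race between
  // our first check and acquiring the lock.
  if (initialized.load(std::memory_order_relaxed))
    return;
  do_platform_init();
  initialized.store(true, std::memory_order_release);
}
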
diff --git a/openmp/runtime/src/kmp_settings.cpp b/openmp/runtime/src/kmp_settings.cpp
--- a/openmp/runtime/src/kmp_settings.cpp
+++ b/openmp/runtime/src/kmp_settings.cpp
@@ -503,6 +503,12 @@
   if (nth < (4 * __kmp_xproc))
     nth = (4 * __kmp_xproc);
 
+#if USE_UNSHACKLED_TASK
+  // If unshackled task is enabled, increase the initial thread capacity by an
+  // extra __kmp_unshackled_threads_num slots.
+  nth += __kmp_unshackled_threads_num;
+#endif
+
   if (nth > __kmp_max_nth)
     nth = __kmp_max_nth;
 
diff --git a/openmp/runtime/src/kmp_taskdeps.h b/openmp/runtime/src/kmp_taskdeps.h
--- a/openmp/runtime/src/kmp_taskdeps.h
+++ b/openmp/runtime/src/kmp_taskdeps.h
@@ -121,7 +121,11 @@
         KA_TRACE(20, ("__kmp_release_deps: T#%d successor %p of %p scheduled "
                       "for execution.\n",
                       gtid, successor->dn.task, task));
+#if USE_UNSHACKLED_TASK
+        __kmp_omp_task(task->encountering_gtid, successor->dn.task, false);
+#else
         __kmp_omp_task(gtid, successor->dn.task, false);
+#endif
       }
     }
diff --git a/openmp/runtime/src/kmp_tasking.cpp b/openmp/runtime/src/kmp_tasking.cpp
--- a/openmp/runtime/src/kmp_tasking.cpp
+++ b/openmp/runtime/src/kmp_tasking.cpp
@@ -325,6 +325,14 @@
 static kmp_int32 __kmp_push_task(kmp_int32 gtid, kmp_task_t *task) {
   kmp_info_t *thread = __kmp_threads[gtid];
   kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task);
+
+#if USE_UNSHACKLED_TASK
+  if (taskdata->td_flags.unshackled) {
+    gtid = KMP_GTID_TO_SHADOW_GTID(gtid);
+    thread = __kmp_threads[gtid];
+  }
+#endif
+
   kmp_task_team_t *task_team = thread->th.th_task_team;
   kmp_int32 tid = __kmp_tid_from_gtid(gtid);
   kmp_thread_data_t *thread_data;
@@ -363,7 +371,8 @@
   // Find tasking deque specific to encountering thread
   thread_data = &task_team->tt.tt_threads_data[tid];
 
-  // No lock needed since only owner can allocate
+  // No lock needed even if the task is unshackled, because the deque for
+  // unshackled thread data has already been initialized
   if (thread_data->td.td_deque == NULL) {
     __kmp_alloc_task_deque(thread, thread_data);
   }
@@ -429,6 +438,16 @@
 
   __kmp_release_bootstrap_lock(&thread_data->td.td_deque_lock);
 
+#if USE_UNSHACKLED_TASK
+  // Signal one worker thread to execute the task
+  if (taskdata->td_flags.unshackled) {
+    // Increment the number of unshackled tasks to be executed
+    KMP_ATOMIC_INC(&__kmp_unexecuted_unshackled_tasks);
+    // Wake unshackled threads up if they're sleeping
+    __kmp_unshackled_worker_thread_signal();
+  }
+#endif
+
   return TASK_SUCCESSFULLY_PUSHED;
 }
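
The push path above pairs an atomic counter with a semaphore: the producer counts the task and posts the semaphore, and each post wakes exactly one sleeping worker. A minimal standalone sketch of this handoff (Linux, POSIX semaphores; the names are editorial stand-ins, not the runtime's):

#include <pthread.h>
#include <semaphore.h>
#include <atomic>
#include <cstdio>

// Stand-ins for __kmp_unexecuted_unshackled_tasks and the task semaphore.
static std::atomic<int> pending_tasks{0};
static sem_t task_sem;

// Producer side, mirroring the end of __kmp_push_task: count the task,
// then post the semaphore so exactly one sleeping worker wakes up.
static void push_task() {
  pending_tasks.fetch_add(1, std::memory_order_release);
  sem_post(&task_sem);
}

// Worker side: sleep until a task is available, then "execute" it.
static void *worker(void *) {
  sem_wait(&task_sem);
  pending_tasks.fetch_sub(1, std::memory_order_acq_rel);
  std::puts("executed one task");
  return nullptr;
}

int main() {
  sem_init(&task_sem, /*pshared=*/0, /*value=*/0);
  pthread_t t;
  pthread_create(&t, nullptr, worker, nullptr);
  push_task();
  pthread_join(t, nullptr);
  sem_destroy(&task_sem);
  return 0;
}
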
@@ -721,7 +740,6 @@
 #else /* ! USE_FAST_MEMORY */
   __kmp_thread_free(thread, taskdata);
 #endif
-
   KA_TRACE(20, ("__kmp_free_task: T#%d freed task %p\n", gtid, taskdata));
 }
@@ -930,8 +948,16 @@
     __ompt_task_finish(task, resumed_task, ompt_task_complete);
 #endif
 
+#if USE_UNSHACKLED_TASK
+  if (taskdata->td_flags.unshackled) {
+    KMP_DEBUG_ASSERT(taskdata->td_parent_task_team);
+    KMP_ATOMIC_DEC(
+        &taskdata->td_parent_task_team->tt.tt_unfinished_unshackled_tasks);
+  }
+#endif
+
   // Only need to keep track of count if team parallel and tasking not
-  // serialized, or task is detachable and event has already been fulfilled
+  // serialized, or task is detachable and event has already been fulfilled
   if (!(taskdata->td_flags.team_serial || taskdata->td_flags.tasking_ser) ||
       taskdata->td_flags.detachable == TASK_DETACHABLE) {
     // Predecrement simulated by "- 1" calculation
@@ -1182,6 +1208,10 @@
   kmp_task_t *task;
   kmp_taskdata_t *taskdata;
   kmp_info_t *thread = __kmp_threads[gtid];
+#if USE_UNSHACKLED_TASK
+  kmp_info_t *encountering_thread = thread;
+  kmp_int32 encountering_gtid = gtid;
+#endif
   kmp_team_t *team = thread->th.th_team;
   kmp_taskdata_t *parent_task = thread->th.th_current_task;
   size_t shareds_offset;
@@ -1189,6 +1219,23 @@
   if (!TCR_4(__kmp_init_middle))
     __kmp_middle_initialize();
 
+#if USE_UNSHACKLED_TASK
+  if (flags->unshackled) {
+    if (!TCR_4(__kmp_init_unshackled)) {
+      __kmp_unshackled_initialize();
+    }
+
+    // For an unshackled task encountered by a regular thread, we push the
+    // task to the unshackled thread whose gtid is KMP_GTID_TO_SHADOW_GTID(gtid)
+    if (!KMP_UNSHACKLED_THREAD(gtid)) {
+      thread = __kmp_threads[KMP_GTID_TO_SHADOW_GTID(gtid)];
+      team = thread->th.th_team;
+      // We don't change the parent-child relation for unshackled tasks, as we
+      // need it for per-task-region synchronization
+    }
+  }
+#endif
+
   KA_TRACE(10, ("__kmp_task_alloc(enter): T#%d loc=%p, flags=(0x%x) "
                 "sizeof_task=%ld sizeof_shared=%ld entry=%p\n",
                 gtid, loc_ref, *((kmp_int32 *)flags), sizeof_kmp_task_t,
@@ -1199,6 +1246,7 @@
     }
     flags->final = 1;
   }
+
   if (flags->tiedness == TASK_UNTIED && !team->t.t_serialized) {
     // Untied task encountered causes the TSC algorithm to check entire deque of
     // the victim thread. If no untied task encountered, then checking the head
@@ -1261,11 +1309,21 @@
   // Avoid double allocation here by combining shareds with taskdata
 #if USE_FAST_MEMORY
+#if USE_UNSHACKLED_TASK
+  taskdata = (kmp_taskdata_t *)__kmp_fast_allocate(
+      encountering_thread, shareds_offset + sizeof_shareds);
+#else
   taskdata = (kmp_taskdata_t *)__kmp_fast_allocate(thread, shareds_offset +
                                                        sizeof_shareds);
+#endif
 #else /* ! USE_FAST_MEMORY */
+#if USE_UNSHACKLED_TASK
+  taskdata = (kmp_taskdata_t *)__kmp_thread_malloc(
+      encountering_thread, shareds_offset + sizeof_shareds);
+#else
   taskdata = (kmp_taskdata_t *)__kmp_thread_malloc(thread, shareds_offset +
                                                        sizeof_shareds);
+#endif
 #endif /* USE_FAST_MEMORY */
   ANNOTATE_HAPPENS_AFTER(taskdata);
@@ -1293,7 +1351,11 @@
   taskdata->td_task_id = KMP_GEN_TASK_ID();
   taskdata->td_team = team;
+#if USE_UNSHACKLED_TASK
+  taskdata->td_alloc_thread = encountering_thread;
+#else
   taskdata->td_alloc_thread = thread;
+#endif
   taskdata->td_parent = parent_task;
   taskdata->td_level = parent_task->td_level + 1; // increment nesting level
   KMP_ATOMIC_ST_RLX(&taskdata->td_untied_count, 0);
@@ -1312,6 +1374,11 @@
   taskdata->td_flags.destructors_thunk = flags->destructors_thunk;
   taskdata->td_flags.proxy = flags->proxy;
   taskdata->td_flags.detachable = flags->detachable;
+#if USE_UNSHACKLED_TASK
+  taskdata->td_flags.unshackled = flags->unshackled;
+  taskdata->td_parent_task_team = encountering_thread->th.th_task_team;
+  taskdata->encountering_gtid = encountering_gtid;
+#endif
   taskdata->td_task_team = thread->th.th_task_team;
   taskdata->td_size_alloc = shareds_offset + sizeof_shareds;
   taskdata->td_flags.tasktype = TASK_EXPLICIT;
@@ -1369,6 +1436,18 @@
     }
   }
 
+#if USE_UNSHACKLED_TASK
+  {
+    kmp_task_team_t *parent_team = taskdata->td_parent_task_team;
+    if (flags->unshackled && parent_team) {
+      KMP_ATOMIC_INC(&parent_team->tt.tt_unfinished_unshackled_tasks);
+      if (!parent_team->tt.tt_unshackled_task_encountered) {
+        TCW_4(parent_team->tt.tt_unshackled_task_encountered, TRUE);
+      }
+    }
+  }
+#endif
+
   KA_TRACE(20, ("__kmp_task_alloc(exit): T#%d created task %p parent=%p\n",
                 gtid, taskdata, taskdata->td_parent));
   ANNOTATE_HAPPENS_BEFORE(task);
@@ -1406,6 +1485,13 @@
                                           size_t sizeof_shareds,
                                           kmp_routine_entry_t task_entry,
                                           kmp_int64 device_id) {
+#if USE_UNSHACKLED_TASK
+  kmp_tasking_flags_t *input_flags = (kmp_tasking_flags_t *)&flags;
+  input_flags->unshackled = TRUE;
+  // An unshackled task is always final for now, because it is created by the
+  // compiler and used only for async offloading
+  input_flags->final = TRUE;
+#endif
   return __kmpc_omp_task_alloc(loc_ref, gtid, flags, sizeof_kmp_task_t,
                                sizeof_shareds, task_entry);
 }
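
__kmpc_omp_target_task_alloc above reinterprets the plain 32-bit flags argument as the kmp_tasking_flags_t bitfield in order to force the unshackled and final bits. A simplified sketch of that trick (editorial; the struct below is a trimmed stand-in rather than the full kmp_tasking_flags_t, and bitfield layout is implementation-defined):

#include <cstdint>
#include <cstdio>

// Trimmed stand-in for the low bits of kmp_tasking_flags_t.
struct tasking_flags {
  unsigned tiedness : 1;
  unsigned final : 1;
  unsigned merged_if0 : 1;
  unsigned destructors_thunk : 1;
  unsigned proxy : 1;
  unsigned priority_specified : 1;
  unsigned detachable : 1;
  unsigned unshackled : 1;
  unsigned reserved : 24;
};
static_assert(sizeof(tasking_flags) == 4, "flags must fit in 32 bits");

int main() {
  std::int32_t flags = 1; // the compiler passed only the "tied" bit
  // Same trick as __kmpc_omp_target_task_alloc: view the integer as flags
  // and force the bits the runtime requires for target tasks.
  auto *input_flags = reinterpret_cast<tasking_flags *>(&flags);
  input_flags->unshackled = 1;
  input_flags->final = 1;
  std::printf("flags word is now 0x%x\n", flags);
  return 0;
}
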
@@ -1478,6 +1564,15 @@
   }
 #endif
 
+#if USE_UNSHACKLED_TASK
+  // Decrement the counter of unshackled tasks to be executed
+  if (taskdata->td_flags.unshackled) {
+    // Unshackled tasks can only be executed by unshackled threads
+    KMP_ASSERT(KMP_UNSHACKLED_THREAD(gtid));
+    KMP_ATOMIC_DEC(&__kmp_unexecuted_unshackled_tasks);
+  }
+#endif
+
   // Proxy tasks are not handled by the runtime
   if (taskdata->td_flags.proxy != TASK_PROXY) {
     ANNOTATE_HAPPENS_AFTER(task);
@@ -1875,6 +1970,14 @@
   must_wait =
       must_wait || (thread->th.th_task_team != NULL &&
                     thread->th.th_task_team->tt.tt_found_proxy_tasks);
+
+#if USE_UNSHACKLED_TASK
+  // If an unshackled task has been encountered, we must enable waiting here.
+  must_wait = must_wait ||
+              (thread->th.th_task_team != NULL &&
+               thread->th.th_task_team->tt.tt_unshackled_task_encountered);
+#endif
+
   if (must_wait) {
     kmp_flag_32 flag(RCAST(std::atomic<kmp_uint32> *,
                            &(taskdata->td_incomplete_child_tasks)),
@@ -2840,7 +2943,13 @@
   thread->th.th_reap_state = KMP_NOT_SAFE_TO_REAP;
 
   threads_data = (kmp_thread_data_t *)TCR_PTR(task_team->tt.tt_threads_data);
+#if USE_UNSHACKLED_TASK
+  // threads_data can be nullptr when unshackled task is enabled
+  if (threads_data == nullptr)
+    return FALSE;
+#else
   KMP_DEBUG_ASSERT(threads_data != NULL);
+#endif
 
   nthreads = task_team->tt.tt_nproc;
   unfinished_threads = &(task_team->tt.tt_unfinished_threads);
@@ -2924,8 +3033,8 @@
       }
     }
 
-    if (task == NULL) // break out of tasking loop
-      break;
+    if (task == NULL)
+      break; // break out of tasking loop
 
     // Found a task; execute it
 #if USE_ITT_BUILD && USE_ITT_NOTIFY
@@ -3380,6 +3489,10 @@
     task_team->tt.tt_nproc = nthreads = team->t.t_nproc;
 
     KMP_ATOMIC_ST_REL(&task_team->tt.tt_unfinished_threads, nthreads);
+#if USE_UNSHACKLED_TASK
+    KMP_ATOMIC_ST_REL(&task_team->tt.tt_unfinished_unshackled_tasks, 0);
+    TCW_4(task_team->tt.tt_unshackled_task_encountered, FALSE);
+#endif
     TCW_4(task_team->tt.tt_active, TRUE);
 
     KA_TRACE(20, ("__kmp_allocate_task_team: T#%d exiting; task_team = %p "
@@ -3552,6 +3665,28 @@
                     ((team != NULL) ? team->t.t_id : -1), other_team));
     }
   }
+
+#if USE_UNSHACKLED_TASK
+  // For a regular thread, task enabling is done when a task is about to be
+  // pushed to a deque. For the unshackled master thread, however, it must be
+  // done ahead of time so that some operations can be performed without race
+  // conditions.
+  if (this_thr == __kmp_unshackled_master_thread) {
+    for (int i = 0; i < 2; ++i) {
+      kmp_task_team_t *task_team = team->t.t_task_team[i];
+      if (KMP_TASKING_ENABLED(task_team)) {
+        continue;
+      }
+      __kmp_enable_tasking(task_team, this_thr);
+      for (int j = 0; j < task_team->tt.tt_nproc; ++j) {
+        kmp_thread_data_t *thread_data = &task_team->tt.tt_threads_data[j];
+        if (thread_data->td.td_deque == NULL) {
+          __kmp_alloc_task_deque(__kmp_unshackled_threads[j], thread_data);
+        }
+      }
+    }
+  }
+#endif
 }
 
 // __kmp_task_team_sync: Propagation of task team data from team to threads
@@ -3618,6 +3753,12 @@
     TCW_PTR(this_thr->th.th_task_team, NULL);
   }
+
+#if USE_UNSHACKLED_TASK
+  // Wait until all unshackled tasks tracked by this task team have finished
+  if (task_team && task_team->tt.tt_unshackled_task_encountered)
+    while (KMP_ATOMIC_LD_ACQ(&task_team->tt.tt_unfinished_unshackled_tasks))
+      ;
+#endif
 }
 
 // __kmp_tasking_barrier:
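
Taken together, the per-task-team accounting above forms a simple protocol: tt_unfinished_unshackled_tasks is incremented when an unshackled task is created (__kmp_task_alloc), decremented when it finishes (__kmp_task_finish), and spun on in __kmp_task_team_wait once tt_unshackled_task_encountered has been set. A standalone sketch of that protocol (editorial; std::thread and stand-in names, not the runtime's types):

#include <atomic>
#include <thread>
#include <vector>

// Stand-in for tt_unfinished_unshackled_tasks: incremented at task creation
// (cf. __kmp_task_alloc), decremented at task completion
// (cf. __kmp_task_finish), spun on when the task team winds down
// (cf. __kmp_task_team_wait).
static std::atomic<int> unfinished_unshackled_tasks{0};

static void create_task() { unfinished_unshackled_tasks.fetch_add(1); }
static void finish_task() { unfinished_unshackled_tasks.fetch_sub(1); }

static void task_team_wait() {
  // The runtime uses the same plain spin; it only reaches this point after
  // observing tt_unshackled_task_encountered.
  while (unfinished_unshackled_tasks.load(std::memory_order_acquire))
    ;
}

int main() {
  std::vector<std::thread> workers;
  for (int i = 0; i < 4; ++i) {
    create_task();
    workers.emplace_back([] { finish_task(); });
  }
  task_team_wait();
  for (auto &w : workers)
    w.join();
  return 0;
}
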
diff --git a/openmp/runtime/src/kmp_wait_release.h b/openmp/runtime/src/kmp_wait_release.h
--- a/openmp/runtime/src/kmp_wait_release.h
+++ b/openmp/runtime/src/kmp_wait_release.h
@@ -381,6 +381,27 @@
       break;
     }
 
+#if USE_UNSHACKLED_TASK
+    // For an unshackled thread, a nullptr task_team means the master thread
+    // has not yet released the barrier. We cannot simply wait in that case:
+    // once the master thread releases all children at the barrier, the
+    // unshackled threads would still be sleeping, so follow-up configuration
+    // such as task team sync would never be performed for this thread,
+    // leaving it without a task team. Usually that is harmless. However, in
+    // the corner case where the first task encountered is an untied task,
+    // the check in __kmp_task_alloc crashes, because it uses the task team
+    // pointer without checking it for nullptr, presumably on the assumption
+    // that the task team has already been set up.
+    if (task_team && KMP_UNSHACKLED_WORKER_THREAD(th_gtid)) {
+      // If there are still unshackled tasks to be executed, the unshackled
+      // thread does not enter a wait state.
+      if (KMP_ATOMIC_LD_ACQ(&__kmp_unexecuted_unshackled_tasks) == 0) {
+        __kmp_unshackled_worker_thread_wait();
+      }
+      continue;
+    }
+#endif
+
     // Don't suspend if KMP_BLOCKTIME is set to "infinite"
     if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME &&
         __kmp_pause_status != kmp_soft_paused)
diff --git a/openmp/runtime/src/z_Linux_util.cpp b/openmp/runtime/src/z_Linux_util.cpp
--- a/openmp/runtime/src/z_Linux_util.cpp
+++ b/openmp/runtime/src/z_Linux_util.cpp
@@ -25,6 +25,7 @@
 #include <alloca.h>
 #endif
 #include <math.h> // HUGE_VAL.
+#include <semaphore.h>
 #include <sys/resource.h>
 #include <sys/syscall.h>
 #include <sys/time.h>
@@ -2439,7 +2440,7 @@
                            ,
                            void **exit_frame_ptr
 #endif
-                           ) {
+) {
 #if OMPT_SUPPORT
   *exit_frame_ptr = OMPT_GET_FRAME_ADDRESS(0);
 #endif
@@ -2518,4 +2519,107 @@
 
 #endif
 
+#if USE_UNSHACKLED_TASK
+
+namespace {
+pthread_t __kmp_unshackled_master_thread_handle;
+
+// Condition variable for initializing the unshackled team
+pthread_cond_t __kmp_unshackled_threads_initz_cond_var;
+pthread_mutex_t __kmp_unshackled_threads_initz_lock;
+
+// Condition variable for the wrapper function of the master thread
+pthread_cond_t __kmp_unshackled_master_thread_cond_var;
+pthread_mutex_t __kmp_unshackled_master_thread_lock;
+
+// Semaphore for worker threads. We don't use a condition variable here,
+// because if multiple signals were sent at the same time, only one thread
+// might be woken up.
+sem_t __kmp_unshackled_task_sem;
+} // namespace
+
+void __kmp_unshackled_worker_thread_wait() {
+  int status = sem_wait(&__kmp_unshackled_task_sem);
+  KMP_CHECK_SYSFAIL("sem_wait", status);
+}
+
+void __kmp_do_initialize_unshackled_threads() {
+  // Initialize the condition variables
+  int status =
+      pthread_cond_init(&__kmp_unshackled_threads_initz_cond_var, nullptr);
+  KMP_CHECK_SYSFAIL("pthread_cond_init", status);
+
+  status =
+      pthread_cond_init(&__kmp_unshackled_master_thread_cond_var, nullptr);
+  KMP_CHECK_SYSFAIL("pthread_cond_init", status);
+
+  status = pthread_mutex_init(&__kmp_unshackled_threads_initz_lock, nullptr);
+  KMP_CHECK_SYSFAIL("pthread_mutex_init", status);
+
+  status = pthread_mutex_init(&__kmp_unshackled_master_thread_lock, nullptr);
+  KMP_CHECK_SYSFAIL("pthread_mutex_init", status);
+
+  // Initialize the semaphore
+  status = sem_init(&__kmp_unshackled_task_sem, 0, 0);
+  KMP_CHECK_SYSFAIL("sem_init", status);
+
+  // Create a new thread to finish the initialization
+  status = pthread_create(
+      &__kmp_unshackled_master_thread_handle, nullptr,
+      [](void *) -> void * {
+        __kmp_unshackled_threads_initz_routine();
+        return nullptr;
+      },
+      nullptr);
+  KMP_CHECK_SYSFAIL("pthread_create", status);
+}
+
+void __kmp_unshackled_threads_initz_wait() {
+  // The initial thread waits here for the completion of the initialization.
+  // The condition variable will be notified by the master thread of the
+  // unshackled team.
+  int status = pthread_mutex_lock(&__kmp_unshackled_threads_initz_lock);
+  KMP_CHECK_SYSFAIL("pthread_mutex_lock", status);
+
+  if (TCR_4(__kmp_init_unshackled_threads)) {
+    status = pthread_cond_wait(&__kmp_unshackled_threads_initz_cond_var,
+                               &__kmp_unshackled_threads_initz_lock);
+    KMP_CHECK_SYSFAIL("pthread_cond_wait", status);
+  }
+
+  status = pthread_mutex_unlock(&__kmp_unshackled_threads_initz_lock);
+  KMP_CHECK_SYSFAIL("pthread_mutex_unlock", status);
+}
+
+void __kmp_unshackled_initz_release() {
+  // After all initialization, reset __kmp_init_unshackled_threads to FALSE
+  int status = pthread_mutex_lock(&__kmp_unshackled_threads_initz_lock);
+  KMP_CHECK_SYSFAIL("pthread_mutex_lock", status);
+
+  status = pthread_cond_signal(&__kmp_unshackled_threads_initz_cond_var);
+  KMP_CHECK_SYSFAIL("pthread_cond_signal", status);
+
+  TCW_SYNC_4(__kmp_init_unshackled_threads, FALSE);
+
+  status = pthread_mutex_unlock(&__kmp_unshackled_threads_initz_lock);
+  KMP_CHECK_SYSFAIL("pthread_mutex_unlock", status);
+}
+
+void __kmp_unshackled_master_thread_wait() {
+  // The master thread of the unshackled team is blocked here. The condition
+  // variable can only be signaled in the destructor of the RTL.
+  int status = pthread_mutex_lock(&__kmp_unshackled_master_thread_lock);
+  KMP_CHECK_SYSFAIL("pthread_mutex_lock", status);
+  status = pthread_cond_wait(&__kmp_unshackled_master_thread_cond_var,
+                             &__kmp_unshackled_master_thread_lock);
+  KMP_CHECK_SYSFAIL("pthread_cond_wait", status);
+  status = pthread_mutex_unlock(&__kmp_unshackled_master_thread_lock);
+  KMP_CHECK_SYSFAIL("pthread_mutex_unlock", status);
+}
+
+void __kmp_unshackled_worker_thread_signal() {
+  int status = sem_post(&__kmp_unshackled_task_sem);
+  KMP_CHECK_SYSFAIL("sem_post", status);
+}
+
+#endif
+
 // end of file //
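
The initialization handshake above is a standard condition-variable rendezvous: the initial thread blocks in __kmp_unshackled_threads_initz_wait until the unshackled master signals it from __kmp_unshackled_initz_release. A minimal standalone sketch (editorial, stand-in names; note it uses a while loop around pthread_cond_wait to guard against spurious wakeups, where the patch relies on an if plus the __kmp_init_unshackled_threads flag):

#include <pthread.h>
#include <cstdio>

// Stand-ins for the initz lock/cond var and the "still initializing" flag.
static pthread_mutex_t initz_lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t initz_cond = PTHREAD_COND_INITIALIZER;
static bool initializing = true;

// Runs on the unshackled master once the team is up (cf. initz_release).
static void *unshackled_master(void *) {
  pthread_mutex_lock(&initz_lock);
  initializing = false;
  pthread_cond_signal(&initz_cond);
  pthread_mutex_unlock(&initz_lock);
  return nullptr;
}

int main() {
  pthread_t t;
  pthread_create(&t, nullptr, unshackled_master, nullptr);
  // Initial thread side (cf. initz_wait): sleep until signaled.
  pthread_mutex_lock(&initz_lock);
  while (initializing)
    pthread_cond_wait(&initz_cond, &initz_lock);
  pthread_mutex_unlock(&initz_lock);
  pthread_join(t, nullptr);
  std::puts("unshackled team initialized");
  return 0;
}
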
diff --git a/openmp/runtime/test/tasking/unshackled_task/depend.cpp b/openmp/runtime/test/tasking/unshackled_task/depend.cpp
new file mode 100644
--- /dev/null
+++ b/openmp/runtime/test/tasking/unshackled_task/depend.cpp
@@ -0,0 +1,168 @@
+// RUN: %libomp-cxx-compile-and-run
+
+/*
+ * This test aims to check whether unshackled tasks can work with regular
+ * tasks in terms of dependences. It is equivalent to the following code:
+ *
+ * #pragma omp parallel for
+ * for (int i = 0; i < N; ++i) {
+ *   int data = 0;
+ *   #pragma omp task shared(data) depend(out: data)
+ *   {
+ *     data = 1;
+ *   }
+ *   #pragma omp task shared(data) depend(inout: data)
+ *   {
+ *     data += 2;
+ *   }
+ *   #pragma omp task shared(data) depend(inout: data)
+ *   {
+ *     data += 4;
+ *   }
+ *   #pragma omp taskwait
+ *   assert(data == 7);
+ * }
+ */
+
+#include <cassert>
+#include <cstdint>
+#include <iostream>
+
+extern "C" {
+struct ident_t;
+
+using kmp_int32 = int32_t;
+using kmp_int64 = int64_t;
+using kmp_routine_entry_t = kmp_int32 (*)(kmp_int32, void *);
+using kmp_intptr_t = intptr_t;
+
+typedef struct kmp_depend_info {
+  kmp_intptr_t base_addr;
+  size_t len;
+  struct {
+    bool in : 1;
+    bool out : 1;
+    bool mtx : 1;
+  } flags;
+} kmp_depend_info_t;
+
+typedef union kmp_cmplrdata {
+  kmp_int32 priority;
+  kmp_routine_entry_t destructors;
+} kmp_cmplrdata_t;
+
+typedef struct kmp_task {
+  void *shareds;
+  kmp_routine_entry_t routine;
+  kmp_int32 part_id;
+  kmp_cmplrdata_t data1;
+  kmp_cmplrdata_t data2;
+} kmp_task_t;
+
+struct kmp_task_t_with_privates {
+  kmp_task_t task;
+};
+
+struct anon {
+  int32_t *data;
+};
+
+int32_t __kmpc_global_thread_num(void *);
+kmp_task_t *__kmpc_omp_task_alloc(ident_t *, kmp_int32, kmp_int32, size_t,
+                                  size_t, kmp_routine_entry_t);
+kmp_task_t *__kmpc_omp_target_task_alloc(ident_t *, kmp_int32, kmp_int32,
+                                         size_t, size_t, kmp_routine_entry_t,
+                                         kmp_int64);
+kmp_int32 __kmpc_omp_taskwait(ident_t *, kmp_int32);
+kmp_int32 __kmpc_omp_task_with_deps(ident_t *loc_ref, kmp_int32 gtid,
+                                    kmp_task_t *new_task, kmp_int32 ndeps,
+                                    kmp_depend_info_t *dep_list,
+                                    kmp_int32 ndeps_noalias,
+                                    kmp_depend_info_t *noalias_dep_list);
+}
+
+kmp_int32 omp_task_entry_1(kmp_int32 gtid, kmp_task_t_with_privates *task) {
+  auto shareds = reinterpret_cast<anon *>(task->task.shareds);
+  auto p = shareds->data;
+  *p = 1;
+  return 0;
+}
+
+kmp_int32 omp_task_entry_2(kmp_int32 gtid, kmp_task_t_with_privates *task) {
+  auto shareds = reinterpret_cast<anon *>(task->task.shareds);
+  auto p = shareds->data;
+  *p += 2;
+  return 0;
+}
+
+kmp_int32 omp_task_entry_3(kmp_int32 gtid, kmp_task_t_with_privates *task) {
+  auto shareds = reinterpret_cast<anon *>(task->task.shareds);
+  auto p = shareds->data;
+  *p += 4;
+  return 0;
+}
+
+int main(int argc, char *argv[]) {
+  constexpr const int N = 1024;
+#pragma omp parallel for
+  for (int i = 0; i < N; ++i) {
+    int32_t gtid = __kmpc_global_thread_num(nullptr);
+    int32_t data = 0;
+
+    // Task 1: a regular task that writes data
+    auto task1 = __kmpc_omp_task_alloc(
+        nullptr, gtid, 1, sizeof(kmp_task_t_with_privates), sizeof(anon),
+        reinterpret_cast<kmp_routine_entry_t>(omp_task_entry_1));
+
+    auto shareds = reinterpret_cast<anon *>(task1->shareds);
+    shareds->data = &data;
+
+    kmp_depend_info_t depinfo1;
+    depinfo1.base_addr = reinterpret_cast<kmp_intptr_t>(&data);
+    depinfo1.flags.out = 1;
+    depinfo1.len = 4;
+
+    __kmpc_omp_task_with_deps(nullptr, gtid, task1, 1, &depinfo1, 0, nullptr);
+
+    // Task 2: an unshackled task depending on task 1
+    auto task2 = __kmpc_omp_target_task_alloc(
+        nullptr, gtid, 1, sizeof(kmp_task_t_with_privates), sizeof(anon),
+        reinterpret_cast<kmp_routine_entry_t>(omp_task_entry_2), -1);
+
+    shareds = reinterpret_cast<anon *>(task2->shareds);
+    shareds->data = &data;
+
+    kmp_depend_info_t depinfo2;
+    depinfo2.base_addr = reinterpret_cast<kmp_intptr_t>(&data);
+    depinfo2.flags.in = 1;
+    depinfo2.flags.out = 1;
+    depinfo2.len = 4;
+
+    __kmpc_omp_task_with_deps(nullptr, gtid, task2, 1, &depinfo2, 0, nullptr);
+
+    // Task 3: a regular task depending on task 2
+    auto task3 = __kmpc_omp_task_alloc(
+        nullptr, gtid, 1, sizeof(kmp_task_t_with_privates), sizeof(anon),
+        reinterpret_cast<kmp_routine_entry_t>(omp_task_entry_3));
+
+    shareds = reinterpret_cast<anon *>(task3->shareds);
+    shareds->data = &data;
+
+    kmp_depend_info_t depinfo3;
+    depinfo3.base_addr = reinterpret_cast<kmp_intptr_t>(&data);
+    depinfo3.flags.in = 1;
+    depinfo3.flags.out = 1;
+    depinfo3.len = 4;
+
+    __kmpc_omp_task_with_deps(nullptr, gtid, task3, 1, &depinfo3, 0, nullptr);
+
+    // Wait for all tasks
+    __kmpc_omp_taskwait(nullptr, gtid);
+
+    assert(data == 7);
+  }
+
+  std::cout << "PASS\n";
+  return 0;
+}
+
+// CHECK: PASS
diff --git a/openmp/runtime/test/tasking/unshackled_task/gtid.cpp b/openmp/runtime/test/tasking/unshackled_task/gtid.cpp
new file mode 100644
--- /dev/null
+++ b/openmp/runtime/test/tasking/unshackled_task/gtid.cpp
@@ -0,0 +1,86 @@
+// RUN: %libomp-cxx-compile-and-run
+
+/*
+ * This test aims to check whether an unshackled thread gets the right gtid.
+ * It is equivalent to the following code:
+ *
+ * #pragma omp parallel for
+ * for (int i = 0; i < N; ++i) {
+ *   int data = -1;
+ *   #pragma omp task unshackled shared(data)
+ *   {
+ *     data = omp_get_global_thread_id();
+ *   }
+ *   #pragma omp taskwait
+ *   assert(data > 0 && data <= __kmp_unshackled_threads_num);
+ * }
+ */
+
+#include <cassert>
+#include <cstdint>
+#include <iostream>
+
+extern "C" {
+struct ident_t;
+
+using kmp_int32 = int32_t;
+using kmp_int64 = int64_t;
+using kmp_routine_entry_t = kmp_int32 (*)(kmp_int32, void *);
+
+typedef union kmp_cmplrdata {
+  kmp_int32 priority;
+  kmp_routine_entry_t destructors;
+} kmp_cmplrdata_t;
+
+typedef struct kmp_task {
+  void *shareds;
+  kmp_routine_entry_t routine;
+  kmp_int32 part_id;
+  kmp_cmplrdata_t data1;
+  kmp_cmplrdata_t data2;
+} kmp_task_t;
+
+struct kmp_task_t_with_privates {
+  kmp_task_t task;
+};
+
+struct anon {
+  int32_t *data;
+};
+
+int32_t __kmpc_global_thread_num(void *) __attribute__((weak));
+kmp_task_t *__kmpc_omp_target_task_alloc(ident_t *, kmp_int32, kmp_int32,
+                                         size_t, size_t, kmp_routine_entry_t,
+                                         kmp_int64);
+kmp_int32 __kmpc_omp_task(ident_t *, kmp_int32, kmp_task_t *);
+kmp_int32 __kmpc_omp_taskwait(ident_t *, kmp_int32);
+}
+
+kmp_int32 omp_task_entry(kmp_int32 gtid, kmp_task_t_with_privates *task) {
+  auto shareds = reinterpret_cast<anon *>(task->task.shareds);
+  auto p = shareds->data;
+  *p = __kmpc_global_thread_num(nullptr);
+  return 0;
+}
+
+int main(int argc, char *argv[]) {
+  constexpr const int N = 1024;
+#pragma omp parallel for
+  for (int i = 0; i < N; ++i) {
+    int32_t gtid = __kmpc_global_thread_num(nullptr);
+    auto task = __kmpc_omp_target_task_alloc(
+        nullptr, gtid, 1, sizeof(kmp_task_t_with_privates), sizeof(anon),
+        reinterpret_cast<kmp_routine_entry_t>(omp_task_entry), -1);
+    auto shareds = reinterpret_cast<anon *>(task->shareds);
+    int32_t data = -1;
+    shareds->data = &data;
+    __kmpc_omp_task(nullptr, gtid, task);
+    __kmpc_omp_taskwait(nullptr, gtid);
+    // FIXME: 8 here is not accurate; it should match the number of
+    // unshackled threads configured in the runtime
+    assert(data > 0 && data <= 8);
+  }
+
+  std::cout << "PASS\n";
+  return 0;
+}
+
+// CHECK: PASS
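
For orientation, this is roughly what the feature serves at the user level (an editorial sketch, not part of the patch, assuming a compiler that lowers target nowait regions to __kmpc_omp_target_task_alloc): each deferred target task is marked unshackled and executed by the unshackled team, so host OpenMP threads are not consumed by offloading bookkeeping. Without device offloading support, the regions simply run on the host:

#include <cassert>

int main() {
  int x = 0, y = 0;
#pragma omp parallel
#pragma omp single
  {
    // Each target nowait region becomes a deferred target task; with this
    // patch, such tasks run on the unshackled team.
#pragma omp target map(tofrom : x) nowait
    x = 1;
#pragma omp target map(tofrom : y) nowait
    y = 2;
    // taskwait also covers the deferred target tasks created above.
#pragma omp taskwait
  }
  assert(x == 1 && y == 2);
  return 0;
}
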