diff --git a/openmp/runtime/src/kmp.h b/openmp/runtime/src/kmp.h --- a/openmp/runtime/src/kmp.h +++ b/openmp/runtime/src/kmp.h @@ -2334,7 +2334,8 @@ unsigned priority_specified : 1; /* set if the compiler provides priority setting for the task */ unsigned detachable : 1; /* 1 == can detach */ - unsigned reserved : 9; /* reserved for compiler use */ + unsigned hidden_helper : 1; /* 1 == hidden helper task */ + unsigned reserved : 8; /* reserved for compiler use */ /* Library flags */ /* Total library flags must be 16 bits */ unsigned tasktype : 1; /* task is either explicit(1) or implicit (0) */ @@ -2382,6 +2383,13 @@ kmp_depnode_t *td_depnode; // Pointer to graph node if this task has dependencies kmp_task_team_t *td_task_team; + // The global thread id of the encountering thread. We need it because when a + // regular task depends on a hidden helper task, and the hidden helper task + // is finished on a hidden helper thread, it will call __kmp_release_deps to + // release all dependences. If now the task is a regular task, we need to pass + // the encountering gtid such that the task will be picked up and executed by + // its encountering team instead of hidden helper team. + kmp_int32 encountering_gtid; size_t td_size_alloc; // Size of task structure, including shareds etc. 
#if defined(KMP_GOMP_COMPAT) // 4 or 8 byte integers for the loop bounds in GOMP_taskloop @@ -2449,6 +2457,9 @@ kmp_int32 tt_max_threads; // # entries allocated for threads_data array kmp_int32 tt_found_proxy_tasks; // found proxy tasks since last barrier kmp_int32 tt_untied_task_encountered; + // There is hidden helper thread encountered in this task team so that we must + // wait when waiting on task team + kmp_int32 tt_hidden_helper_task_encountered; KMP_ALIGN_CACHE std::atomic tt_unfinished_threads; /* #threads still active */ @@ -2917,6 +2928,7 @@ extern volatile int __kmp_init_monitor; #endif extern volatile int __kmp_init_user_locks; +extern volatile int __kmp_init_hidden_helper_threads; extern int __kmp_init_counter; extern int __kmp_root_counter; extern int __kmp_version; @@ -3985,6 +3997,45 @@ extern void __kmp_omp_display_env(int verbose); +// 1: it is initializing hidden helper team +extern volatile int __kmp_init_hidden_helper; +// 1: the hidden helper team is done +extern volatile int __kmp_hidden_helper_team_done; +// 1: enable hidden helper task +extern kmp_int32 __kmp_enable_hidden_helper; +// Main thread of hidden helper team +extern kmp_info_t *__kmp_hidden_helper_main_thread; +// Descriptors for the hidden helper threads +extern kmp_info_t **__kmp_hidden_helper_threads; +// Number of hidden helper threads +extern kmp_int32 __kmp_hidden_helper_threads_num; +// Number of hidden helper tasks that have not been executed yet +extern std::atomic __kmp_unexecuted_hidden_helper_tasks; + +extern void __kmp_hidden_helper_initialize(); +extern void __kmp_hidden_helper_threads_initz_routine(); +extern void __kmp_do_initialize_hidden_helper_threads(); +extern void __kmp_hidden_helper_threads_initz_wait(); +extern void __kmp_hidden_helper_initz_release(); +extern void __kmp_hidden_helper_threads_deinitz_wait(); +extern void __kmp_hidden_helper_threads_deinitz_release(); +extern void __kmp_hidden_helper_main_thread_wait(); +extern void 
__kmp_hidden_helper_worker_thread_wait(); +extern void __kmp_hidden_helper_worker_thread_signal(); +extern void __kmp_hidden_helper_main_thread_release(); + +// Check whether a given thread is a hidden helper thread +#define KMP_HIDDEN_HELPER_THREAD(gtid) \ + ((gtid) >= 1 && (gtid) <= __kmp_hidden_helper_threads_num) + +#define KMP_HIDDEN_HELPER_WORKER_THREAD(gtid) \ + ((gtid) > 1 && (gtid) <= __kmp_hidden_helper_threads_num) + +// Map a gtid to a hidden helper thread. The first hidden helper thread, a.k.a +// main thread, is skipped. +#define KMP_GTID_TO_SHADOW_GTID(gtid) \ + ((gtid) % (__kmp_hidden_helper_threads_num - 1) + 2) + #ifdef __cplusplus } #endif diff --git a/openmp/runtime/src/kmp_global.cpp b/openmp/runtime/src/kmp_global.cpp --- a/openmp/runtime/src/kmp_global.cpp +++ b/openmp/runtime/src/kmp_global.cpp @@ -46,6 +46,9 @@ volatile int __kmp_init_common = FALSE; volatile int __kmp_init_middle = FALSE; volatile int __kmp_init_parallel = FALSE; +volatile int __kmp_init_hidden_helper = FALSE; +volatile int __kmp_init_hidden_helper_threads = FALSE; +volatile int __kmp_hidden_helper_team_done = FALSE; #if KMP_USE_MONITOR volatile int __kmp_init_monitor = 0; /* 1 - launched, 2 - actually started (Windows* OS only) */ diff --git a/openmp/runtime/src/kmp_runtime.cpp b/openmp/runtime/src/kmp_runtime.cpp --- a/openmp/runtime/src/kmp_runtime.cpp +++ b/openmp/runtime/src/kmp_runtime.cpp @@ -3644,15 +3644,37 @@ } } - /* find an available thread slot */ - /* Don't reassign the zero slot since we need that to only be used by initial - thread */ - for (gtid = (initial_thread ? 0 : 1); TCR_PTR(__kmp_threads[gtid]) != NULL; - gtid++) - ; - KA_TRACE(1, - ("__kmp_register_root: found slot in threads array: T#%d\n", gtid)); - KMP_ASSERT(gtid < __kmp_threads_capacity); + // When hidden helper task is enabled, __kmp_threads is organized as follows: + // 0: initial thread, also a regular OpenMP thread. 
+  // [1, __kmp_hidden_helper_threads_num]: slots for hidden helper threads.
+  // [__kmp_hidden_helper_threads_num + 1, __kmp_threads_capacity): slots for
+  // regular OpenMP threads.
+  if (TCR_4(__kmp_init_hidden_helper_threads)) {
+    // Find an available slot for a hidden helper thread in the range
+    // [1, __kmp_hidden_helper_threads_num]; test the bound before the slot.
+    for (gtid = 1; gtid <= __kmp_hidden_helper_threads_num &&
+                   TCR_PTR(__kmp_threads[gtid]) != NULL;
+         gtid++)
+      ;
+    KMP_ASSERT(gtid <= __kmp_hidden_helper_threads_num);
+    KA_TRACE(1, ("__kmp_register_root: found slot in threads array for "
+                 "hidden helper thread: T#%d\n",
+                 gtid));
+  } else {
+    /* find an available thread slot */
+    // Don't reassign the zero slot since we need that to only be used by
+    // initial thread. Slots for hidden helper threads should also be skipped.
+    if (initial_thread && __kmp_threads[0] == NULL) {
+      gtid = 0;
+    } else {
+      for (gtid = __kmp_hidden_helper_threads_num + 1;
+           TCR_PTR(__kmp_threads[gtid]) != NULL; gtid++)
+        ;
+    }
+    KA_TRACE(
+        1, ("__kmp_register_root: found slot in threads array: T#%d\n", gtid));
+    KMP_ASSERT(gtid < __kmp_threads_capacity);
+  }

   /* update global accounting */
   __kmp_all_nth++;
@@ -4303,8 +4325,20 @@
 #endif

   KMP_MB();
-  for (new_gtid = 1; TCR_PTR(__kmp_threads[new_gtid]) != NULL; ++new_gtid) {
-    KMP_DEBUG_ASSERT(new_gtid < __kmp_threads_capacity);
+
+  {
+    int new_start_gtid = TCR_4(__kmp_init_hidden_helper_threads)
+                             ? 1
+                             : __kmp_hidden_helper_threads_num + 1;
+
+    for (new_gtid = new_start_gtid; TCR_PTR(__kmp_threads[new_gtid]) != NULL;
+         ++new_gtid) {
+      KMP_DEBUG_ASSERT(new_gtid < __kmp_threads_capacity);
+    }
+
+    if (TCR_4(__kmp_init_hidden_helper_threads)) {
+      KMP_DEBUG_ASSERT(new_gtid <= __kmp_hidden_helper_threads_num);
+    }
   }

   /* allocate space for it. */
@@ -6249,6 +6283,15 @@
     return;
   }

+  // If hidden helper team has been initialized, we need to deinit it
+  if (TCR_4(__kmp_init_hidden_helper)) {
+    TCW_SYNC_4(__kmp_hidden_helper_team_done, TRUE);
+    // First release the main thread to let it continue its work
+    __kmp_hidden_helper_main_thread_release();
+    // Wait until the hidden helper team has been destroyed
+    __kmp_hidden_helper_threads_deinitz_wait();
+  }
+
   KMP_MB(); /* Flush all pending memory write invalidates. */

   /* find out who we are and what we should do */
@@ -7125,6 +7168,41 @@
   __kmp_release_bootstrap_lock(&__kmp_initz_lock);
 }

+void __kmp_hidden_helper_initialize() {
+  if (TCR_4(__kmp_init_hidden_helper))
+    return;
+
+  // __kmp_parallel_initialize is required before we initialize hidden helper
+  if (!TCR_4(__kmp_init_parallel))
+    __kmp_parallel_initialize();
+
+  // Re-check under the lock. The lock must not be taken before the call to
+  // __kmp_parallel_initialize above, as that would cause a deadlock.
+  __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
+  if (TCR_4(__kmp_init_hidden_helper)) {
+    __kmp_release_bootstrap_lock(&__kmp_initz_lock);
+    return;
+  }
+
+  // Set the count of hidden helper tasks to be executed to zero
+  KMP_ATOMIC_ST_REL(&__kmp_unexecuted_hidden_helper_tasks, 0);
+
+  // Set the global variable indicating that we're initializing hidden helper
+  // team/threads
+  TCW_SYNC_4(__kmp_init_hidden_helper_threads, TRUE);
+
+  // Platform-specific initialization (implemented in the OS utility file)
+  __kmp_do_initialize_hidden_helper_threads();
+
+  // Wait here for the finish of initialization of hidden helper teams
+  __kmp_hidden_helper_threads_initz_wait();
+
+  // We have finished hidden helper initialization
+  TCW_SYNC_4(__kmp_init_hidden_helper, TRUE);
+
+  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
+}
+
 /* ------------------------------------------------------------------------ */

 void __kmp_run_before_invoked_task(int gtid, int tid, kmp_info_t *this_thr,
@@ -8470,7 +8548,6 @@
   }
 }

-
 void __kmp_omp_display_env(int verbose) {
__kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
   if (__kmp_init_serial == 0)
@@ -8478,3 +8555,65 @@
   __kmp_display_env_impl(!verbose, verbose);
   __kmp_release_bootstrap_lock(&__kmp_initz_lock);
 }
+
+// Globals and functions for hidden helper task
+kmp_info_t **__kmp_hidden_helper_threads;
+kmp_info_t *__kmp_hidden_helper_main_thread;
+kmp_int32 __kmp_hidden_helper_threads_num = 8;
+std::atomic<kmp_int32> __kmp_unexecuted_hidden_helper_tasks;
+#if KMP_OS_LINUX
+kmp_int32 __kmp_enable_hidden_helper = TRUE;
+#else
+kmp_int32 __kmp_enable_hidden_helper = FALSE;
+#endif
+
+namespace {
+std::atomic<kmp_int32> __kmp_hit_hidden_helper_threads_num;
+
+// Microtask run by every thread of the hidden helper team. All threads first
+// rendezvous; the main thread then releases the initial thread, blocks in
+// __kmp_hidden_helper_main_thread_wait(), and finally posts the worker wake-ups.
+void __kmp_hidden_helper_wrapper_fn(int *gtid, int *, ...) {
+  // Explicit rendezvous of all hidden helper threads: without it, a regular
+  // thread could push a hidden helper task to a helper thread that has not
+  // yet been woken up even once since the main thread released the team
+  // after creating it.
+  KMP_ATOMIC_INC(&__kmp_hit_hidden_helper_threads_num);
+  while (KMP_ATOMIC_LD_ACQ(&__kmp_hit_hidden_helper_threads_num) !=
+         __kmp_hidden_helper_threads_num)
+    ;
+
+  // If main thread, then wait for signal
+  if (__kmpc_master(nullptr, *gtid)) {
+    // First, unset the initial state and release the initial thread
+    TCW_4(__kmp_init_hidden_helper_threads, FALSE);
+    __kmp_hidden_helper_initz_release();
+    __kmp_hidden_helper_main_thread_wait();
+    // Now wake up all worker threads
+    for (int i = 1; i < __kmp_hit_hidden_helper_threads_num; ++i) {
+      __kmp_hidden_helper_worker_thread_signal();
+    }
+  }
+}
+} // namespace
+
+// Registers a new root for the hidden helper team, forks the team with
+// __kmp_hidden_helper_threads_num threads, and, once the fork call returns,
+// clears the init flag and signals that deinitialization is complete.
+void __kmp_hidden_helper_threads_initz_routine() {
+  // Create a new root for hidden helper team/threads
+  const int gtid = __kmp_register_root(TRUE);
+  __kmp_hidden_helper_main_thread = __kmp_threads[gtid];
+  __kmp_hidden_helper_threads = &__kmp_threads[gtid];
+  __kmp_hidden_helper_main_thread->th.th_set_nproc =
+      __kmp_hidden_helper_threads_num;
+
+  KMP_ATOMIC_ST_REL(&__kmp_hit_hidden_helper_threads_num, 0);
+
+  __kmpc_fork_call(nullptr, 0, __kmp_hidden_helper_wrapper_fn);
+
+  // Set the initialization flag to FALSE
+  TCW_SYNC_4(__kmp_init_hidden_helper, FALSE);
+
+  __kmp_hidden_helper_threads_deinitz_release();
+}
diff --git a/openmp/runtime/src/kmp_settings.cpp b/openmp/runtime/src/kmp_settings.cpp
--- a/openmp/runtime/src/kmp_settings.cpp
+++ b/openmp/runtime/src/kmp_settings.cpp
@@ -503,6 +503,11 @@
   if (nth < (4 * __kmp_xproc))
     nth = (4 * __kmp_xproc);

+  // Reserve __kmp_hidden_helper_threads_num extra slots in the initial
+  // thread capacity for the hidden helper threads. This is done whether or
+  // not hidden helper tasks are enabled.
+  nth += __kmp_hidden_helper_threads_num;
+
   if (nth > __kmp_max_nth)
     nth = __kmp_max_nth;

@@ -1161,6 +1166,46 @@
   K_DIAG(1, ("__kmp_dflt_team_nth == %d\n", __kmp_dflt_team_nth));
 } // __kmp_stg_parse_num_threads

+// Parse the number of hidden helper threads (0 and 16 are passed as the
+// bounds). Fewer than two threads disables hidden helper tasks.
+static void __kmp_stg_parse_num_hidden_helper_threads(char const *name,
+                                                      char const *value,
+                                                      void *data) {
+  __kmp_stg_parse_int(name, value, 0, 16, &__kmp_hidden_helper_threads_num);
+  // The first hidden helper thread is the main thread and is skipped when
+  // tasks are distributed, so at least two threads are needed. With fewer,
+  // KMP_GTID_TO_SHADOW_GTID would take a modulo by zero; disable the feature.
+  if (__kmp_hidden_helper_threads_num < 2) {
+    __kmp_enable_hidden_helper = FALSE;
+  }
+} // __kmp_stg_parse_num_hidden_helper_threads
+
+// Print the number of hidden helper threads under the given setting name.
+static void __kmp_stg_print_num_hidden_helper_threads(kmp_str_buf_t *buffer,
+                                                      char const *name,
+                                                      void *data) {
+  __kmp_stg_print_int(buffer, name, __kmp_hidden_helper_threads_num);
+} // __kmp_stg_print_num_hidden_helper_threads
+
+// Parse the hidden helper task on/off switch. On non-Linux platforms the
+// feature is forced off regardless of the value supplied.
+static void __kmp_stg_parse_use_hidden_helper(char const *name,
+                                              char const *value, void *data) {
+  __kmp_stg_parse_bool(name, value, &__kmp_enable_hidden_helper);
+#if !KMP_OS_LINUX
+  __kmp_enable_hidden_helper = FALSE;
+  K_DIAG(1,
+         ("__kmp_stg_parse_use_hidden_helper: Disable hidden helper task on "
+          "non-Linux platform although it is enabled by user explicitly.\n"));
+#endif
+} // __kmp_stg_parse_use_hidden_helper
+
+// Print the hidden helper task on/off switch under the given setting name.
+static void __kmp_stg_print_use_hidden_helper(kmp_str_buf_t *buffer,
+                                              char const *name, void *data) {
+  __kmp_stg_print_bool(buffer, name, __kmp_enable_hidden_helper);
+} // __kmp_stg_print_use_hidden_helper
+
 static void __kmp_stg_print_num_threads(kmp_str_buf_t *buffer, char const *name,
                                         void *data) {
   if (__kmp_env_format) {
@@ -4954,6 +4999,11 @@
      __kmp_stg_print_omp_cancellation, NULL, 0, 0},
     {"OMP_ALLOCATOR", __kmp_stg_parse_allocator, __kmp_stg_print_allocator,
      NULL, 0, 0},
+    {"LIBOMP_USE_HIDDEN_HELPER_TASK", __kmp_stg_parse_use_hidden_helper,
+     __kmp_stg_print_use_hidden_helper, NULL, 0, 0},
+    {"LIBOMP_NUM_HIDDEN_HELPER_THREADS",
+     __kmp_stg_parse_num_hidden_helper_threads,
+     __kmp_stg_print_num_hidden_helper_threads, NULL, 0, 0},

 #if OMPT_SUPPORT
     {"OMP_TOOL", __kmp_stg_parse_omp_tool, __kmp_stg_print_omp_tool, NULL, 0,
diff --git a/openmp/runtime/src/kmp_taskdeps.h b/openmp/runtime/src/kmp_taskdeps.h
--- a/openmp/runtime/src/kmp_taskdeps.h
+++ b/openmp/runtime/src/kmp_taskdeps.h
@@ -119,6 +119,7 @@
   KMP_RELEASE_DEPNODE(gtid, node);

   kmp_depnode_list_t *next;
+  kmp_taskdata_t *next_taskdata;
   for (kmp_depnode_list_t *p = node->dn.successors; p; p = next) {
     kmp_depnode_t *successor = p->node;
     kmp_int32 npredecessors = KMP_ATOMIC_DEC(&successor->dn.npredecessors) - 1;
@@ -131,7 +132,24 @@
         KA_TRACE(20, ("__kmp_release_deps: T#%d successor %p of %p scheduled "
                       "for execution.\n",
                       gtid, successor->dn.task, task));
-        __kmp_omp_task(gtid, successor->dn.task, false);
+        // If a regular task depends on a hidden helper task, then once the
+        // hidden helper task is done, the regular task should be executed by
+        // its encountering team.
+        if (KMP_HIDDEN_HELPER_THREAD(gtid)) {
+          // Hidden helper thread can only execute hidden helper tasks
+          KMP_ASSERT(task->td_flags.hidden_helper);
+          next_taskdata = KMP_TASK_TO_TASKDATA(successor->dn.task);
+          // If the dependent task is a regular task, we need to push to its
+          // encountering thread's queue; otherwise, it can be pushed to its own
+          // queue.
+ if (!next_taskdata->td_flags.hidden_helper) { + __kmp_omp_task(task->encountering_gtid, successor->dn.task, false); + } else { + __kmp_omp_task(gtid, successor->dn.task, false); + } + } else { + __kmp_omp_task(gtid, successor->dn.task, false); + } } } diff --git a/openmp/runtime/src/kmp_taskdeps.cpp b/openmp/runtime/src/kmp_taskdeps.cpp --- a/openmp/runtime/src/kmp_taskdeps.cpp +++ b/openmp/runtime/src/kmp_taskdeps.cpp @@ -588,7 +588,9 @@ current_task->td_flags.tasking_ser || current_task->td_flags.final; kmp_task_team_t *task_team = thread->th.th_task_team; - serial = serial && !(task_team && task_team->tt.tt_found_proxy_tasks); + serial = serial && + !(task_team && (task_team->tt.tt_found_proxy_tasks || + task_team->tt.tt_hidden_helper_task_encountered)); if (!serial && (ndeps > 0 || ndeps_noalias > 0)) { /* if no dependencies have been tracked yet, create the dependence hash */ diff --git a/openmp/runtime/src/kmp_tasking.cpp b/openmp/runtime/src/kmp_tasking.cpp --- a/openmp/runtime/src/kmp_tasking.cpp +++ b/openmp/runtime/src/kmp_tasking.cpp @@ -325,6 +325,12 @@ static kmp_int32 __kmp_push_task(kmp_int32 gtid, kmp_task_t *task) { kmp_info_t *thread = __kmp_threads[gtid]; kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task); + + if (taskdata->td_flags.hidden_helper) { + gtid = KMP_GTID_TO_SHADOW_GTID(gtid); + thread = __kmp_threads[gtid]; + } + kmp_task_team_t *task_team = thread->th.th_task_team; kmp_int32 tid = __kmp_tid_from_gtid(gtid); kmp_thread_data_t *thread_data; @@ -363,7 +369,9 @@ // Find tasking deque specific to encountering thread thread_data = &task_team->tt.tt_threads_data[tid]; - // No lock needed since only owner can allocate + // No lock needed since only owner can allocate. If the task is hidden_helper, + // we don't need it either because we have initialized the dequeue for hidden + // helper thread data. 
if (UNLIKELY(thread_data->td.td_deque == NULL)) { __kmp_alloc_task_deque(thread, thread_data); } @@ -429,6 +437,12 @@ __kmp_release_bootstrap_lock(&thread_data->td.td_deque_lock); + // Signal one worker thread to execute the task + if (taskdata->td_flags.hidden_helper) { + // Wake hidden helper threads up if they're sleeping + __kmp_hidden_helper_worker_thread_signal(); + } + return TASK_SUCCESSFULLY_PUSHED; } @@ -721,7 +735,6 @@ #else /* ! USE_FAST_MEMORY */ __kmp_thread_free(thread, taskdata); #endif - KA_TRACE(20, ("__kmp_free_task: T#%d freed task %p\n", gtid, taskdata)); } @@ -922,7 +935,8 @@ // Only need to keep track of count if team parallel and tasking not // serialized, or task is detachable and event has already been fulfilled if (!(taskdata->td_flags.team_serial || taskdata->td_flags.tasking_ser) || - taskdata->td_flags.detachable == TASK_DETACHABLE) { + taskdata->td_flags.detachable == TASK_DETACHABLE || + taskdata->td_flags.hidden_helper) { // Predecrement simulated by "- 1" calculation children = KMP_ATOMIC_DEC(&taskdata->td_parent->td_incomplete_child_tasks) - 1; @@ -1171,6 +1185,7 @@ kmp_task_t *task; kmp_taskdata_t *taskdata; kmp_info_t *thread = __kmp_threads[gtid]; + kmp_info_t *encountering_thread = thread; kmp_team_t *team = thread->th.th_team; kmp_taskdata_t *parent_task = thread->th.th_current_task; size_t shareds_offset; @@ -1178,6 +1193,25 @@ if (UNLIKELY(!TCR_4(__kmp_init_middle))) __kmp_middle_initialize(); + if (flags->hidden_helper) { + if (__kmp_enable_hidden_helper) { + if (!TCR_4(__kmp_init_hidden_helper)) + __kmp_hidden_helper_initialize(); + + // For a hidden helper task encountered by a regular thread, we will push + // the task to the (gtid%__kmp_hidden_helper_threads_num)-th hidden helper + // thread. + if (!KMP_HIDDEN_HELPER_THREAD(gtid)) { + thread = __kmp_threads[KMP_GTID_TO_SHADOW_GTID(gtid)]; + // We don't change the parent-child relation for hidden helper task as + // we need that to do per-task-region synchronization. 
+ } + } else { + // If the hidden helper task is not enabled, reset the flag to FALSE. + flags->hidden_helper = FALSE; + } + } + KA_TRACE(10, ("__kmp_task_alloc(enter): T#%d loc=%p, flags=(0x%x) " "sizeof_task=%ld sizeof_shared=%ld entry=%p\n", gtid, loc_ref, *((kmp_int32 *)flags), sizeof_kmp_task_t, @@ -1188,24 +1222,27 @@ } flags->final = 1; } + if (flags->tiedness == TASK_UNTIED && !team->t.t_serialized) { // Untied task encountered causes the TSC algorithm to check entire deque of // the victim thread. If no untied task encountered, then checking the head // of the deque should be enough. - KMP_CHECK_UPDATE(thread->th.th_task_team->tt.tt_untied_task_encountered, 1); + KMP_CHECK_UPDATE( + encountering_thread->th.th_task_team->tt.tt_untied_task_encountered, 1); } // Detachable tasks are not proxy tasks yet but could be in the future. Doing // the tasking setup // when that happens is too late. - if (flags->proxy == TASK_PROXY || flags->detachable == TASK_DETACHABLE) { + if (flags->proxy == TASK_PROXY || flags->detachable == TASK_DETACHABLE || + flags->hidden_helper) { if (flags->proxy == TASK_PROXY) { flags->tiedness = TASK_UNTIED; flags->merged_if0 = 1; } /* are we running in a sequential parallel or tskm_immediate_exec... 
we need tasking support enabled */ - if ((thread->th.th_task_team) == NULL) { + if ((encountering_thread->th.th_task_team) == NULL) { /* This should only happen if the team is serialized setup a task team and propagate it to the thread */ KMP_DEBUG_ASSERT(team->t.t_serialized); @@ -1213,28 +1250,33 @@ ("T#%d creating task team in __kmp_task_alloc for proxy task\n", gtid)); __kmp_task_team_setup( - thread, team, + encountering_thread, team, 1); // 1 indicates setup the current team regardless of nthreads - thread->th.th_task_team = team->t.t_task_team[thread->th.th_task_state]; + encountering_thread->th.th_task_team = + team->t.t_task_team[encountering_thread->th.th_task_state]; } - kmp_task_team_t *task_team = thread->th.th_task_team; + kmp_task_team_t *task_team = encountering_thread->th.th_task_team; /* tasking must be enabled now as the task might not be pushed */ if (!KMP_TASKING_ENABLED(task_team)) { KA_TRACE( 30, ("T#%d enabling tasking in __kmp_task_alloc for proxy task\n", gtid)); - __kmp_enable_tasking(task_team, thread); - kmp_int32 tid = thread->th.th_info.ds.ds_tid; + __kmp_enable_tasking(task_team, encountering_thread); + kmp_int32 tid = encountering_thread->th.th_info.ds.ds_tid; kmp_thread_data_t *thread_data = &task_team->tt.tt_threads_data[tid]; // No lock needed since only owner can allocate if (thread_data->td.td_deque == NULL) { - __kmp_alloc_task_deque(thread, thread_data); + __kmp_alloc_task_deque(encountering_thread, thread_data); } } - if (task_team->tt.tt_found_proxy_tasks == FALSE) + if (flags->proxy == TASK_PROXY && + task_team->tt.tt_found_proxy_tasks == FALSE) TCW_4(task_team->tt.tt_found_proxy_tasks, TRUE); + if (flags->hidden_helper && + task_team->tt.tt_hidden_helper_task_encountered == FALSE) + TCW_4(task_team->tt.tt_hidden_helper_task_encountered, TRUE); } // Calculate shared structure offset including padding after kmp_task_t struct @@ -1248,13 +1290,13 @@ KA_TRACE(30, ("__kmp_task_alloc: T#%d Second malloc size: %ld\n", gtid, 
sizeof_shareds)); -// Avoid double allocation here by combining shareds with taskdata + // Avoid double allocation here by combining shareds with taskdata #if USE_FAST_MEMORY - taskdata = (kmp_taskdata_t *)__kmp_fast_allocate(thread, shareds_offset + - sizeof_shareds); + taskdata = (kmp_taskdata_t *)__kmp_fast_allocate( + encountering_thread, shareds_offset + sizeof_shareds); #else /* ! USE_FAST_MEMORY */ - taskdata = (kmp_taskdata_t *)__kmp_thread_malloc(thread, shareds_offset + - sizeof_shareds); + taskdata = (kmp_taskdata_t *)__kmp_thread_malloc( + encountering_thread, shareds_offset + sizeof_shareds); #endif /* USE_FAST_MEMORY */ ANNOTATE_HAPPENS_AFTER(taskdata); @@ -1281,8 +1323,8 @@ task->part_id = 0; // AC: Always start with 0 part id taskdata->td_task_id = KMP_GEN_TASK_ID(); - taskdata->td_team = team; - taskdata->td_alloc_thread = thread; + taskdata->td_team = thread->th.th_team; + taskdata->td_alloc_thread = encountering_thread; taskdata->td_parent = parent_task; taskdata->td_level = parent_task->td_level + 1; // increment nesting level KMP_ATOMIC_ST_RLX(&taskdata->td_untied_count, 0); @@ -1301,6 +1343,8 @@ taskdata->td_flags.destructors_thunk = flags->destructors_thunk; taskdata->td_flags.proxy = flags->proxy; taskdata->td_flags.detachable = flags->detachable; + taskdata->td_flags.hidden_helper = flags->hidden_helper; + taskdata->encountering_gtid = gtid; taskdata->td_task_team = thread->th.th_task_team; taskdata->td_size_alloc = shareds_offset + sizeof_shareds; taskdata->td_flags.tasktype = TASK_EXPLICIT; @@ -1342,12 +1386,11 @@ if (UNLIKELY(ompt_enabled.enabled)) __ompt_task_init(taskdata, gtid); #endif -// Only need to keep track of child task counts if team parallel and tasking not -// serialized or if it is a proxy or detachable task - if (flags->proxy == TASK_PROXY || - flags->detachable == TASK_DETACHABLE || - !(taskdata->td_flags.team_serial || taskdata->td_flags.tasking_ser)) - { + // Only need to keep track of child task counts if team parallel 
and tasking
+  // not serialized or if it is a proxy or detachable or hidden helper task
+  if (flags->proxy == TASK_PROXY || flags->detachable == TASK_DETACHABLE ||
+      flags->hidden_helper ||
+      !(taskdata->td_flags.team_serial || taskdata->td_flags.tasking_ser)) {
     KMP_ATOMIC_INC(&parent_task->td_incomplete_child_tasks);
     if (parent_task->td_taskgroup)
       KMP_ATOMIC_INC(&parent_task->td_taskgroup->count);
@@ -1358,6 +1401,12 @@
     }
   }

+  if (flags->hidden_helper) {
+    taskdata->td_flags.task_serial = FALSE;
+    // Increment the number of hidden helper tasks to be executed
+    KMP_ATOMIC_INC(&__kmp_unexecuted_hidden_helper_tasks);
+  }
+
   KA_TRACE(20, ("__kmp_task_alloc(exit): T#%d created task %p parent=%p\n",
                 gtid, taskdata, taskdata->td_parent));
   ANNOTATE_HAPPENS_BEFORE(task);
@@ -1395,6 +1444,11 @@
                                              size_t sizeof_shareds,
                                              kmp_routine_entry_t task_entry,
                                              kmp_int64 device_id) {
+  if (__kmp_enable_hidden_helper) {
+    auto &input_flags = reinterpret_cast<kmp_tasking_flags_t &>(flags);
+    input_flags.hidden_helper = TRUE;
+  }
+
   return __kmpc_omp_task_alloc(loc_ref, gtid, flags, sizeof_kmp_task_t,
                                sizeof_shareds, task_entry);
 }
@@ -1467,6 +1521,13 @@
   }
 #endif

+  // Decrement the counter of hidden helper tasks to be executed
+  if (taskdata->td_flags.hidden_helper) {
+    // Hidden helper tasks can only be executed by hidden helper threads
+    KMP_ASSERT(KMP_HIDDEN_HELPER_THREAD(gtid));
+    KMP_ATOMIC_DEC(&__kmp_unexecuted_hidden_helper_tasks);
+  }
+
   // Proxy tasks are not handled by the runtime
   if (taskdata->td_flags.proxy != TASK_PROXY) {
     ANNOTATE_HAPPENS_AFTER(task);
@@ -1864,6 +1925,12 @@

     must_wait = must_wait || (thread->th.th_task_team != NULL &&
                               thread->th.th_task_team->tt.tt_found_proxy_tasks);
+    // If a hidden helper task has been encountered, we must wait here.
+    must_wait =
+        must_wait ||
+        (__kmp_enable_hidden_helper && thread->th.th_task_team != NULL &&
+         thread->th.th_task_team->tt.tt_hidden_helper_task_encountered);
+
     if (must_wait) {
       kmp_flag_32 flag(
           RCAST(std::atomic<kmp_uint32> *,
@@ -2830,11 +2897,13 @@
   thread->th.th_reap_state = KMP_NOT_SAFE_TO_REAP;

   threads_data = (kmp_thread_data_t *)TCR_PTR(task_team->tt.tt_threads_data);
+
   KMP_DEBUG_ASSERT(threads_data != NULL);

   nthreads = task_team->tt.tt_nproc;
   unfinished_threads = &(task_team->tt.tt_unfinished_threads);
-  KMP_DEBUG_ASSERT(nthreads > 1 || task_team->tt.tt_found_proxy_tasks);
+  KMP_DEBUG_ASSERT(nthreads > 1 || task_team->tt.tt_found_proxy_tasks ||
+                   task_team->tt.tt_hidden_helper_task_encountered);
   KMP_DEBUG_ASSERT(*unfinished_threads >= 0);

   while (1) { // Outer loop keeps trying to find tasks in case of single thread
@@ -2914,8 +2983,8 @@
         }
       }

-      if (task == NULL) // break out of tasking loop
-        break;
+      if (task == NULL)
+        break; // break out of tasking loop

       // Found a task; execute it
 #if USE_ITT_BUILD && USE_ITT_NOTIFY
@@ -3001,7 +3070,8 @@

     // We could be getting tasks from target constructs; if this is the only
     // thread, keep trying to execute tasks from own queue
-    if (nthreads == 1)
+    if (nthreads == 1 &&
+        KMP_ATOMIC_LD_ACQ(&current_task->td_incomplete_child_tasks))
       use_own_tasks = 1;
     else {
       KA_TRACE(15,
@@ -3389,6 +3459,7 @@
   task_team->tt.tt_nproc = nthreads = team->t.t_nproc;

   KMP_ATOMIC_ST_REL(&task_team->tt.tt_unfinished_threads, nthreads);
+  TCW_4(task_team->tt.tt_hidden_helper_task_encountered, FALSE);
   TCW_4(task_team->tt.tt_active, TRUE);

   KA_TRACE(20, ("__kmp_allocate_task_team: T#%d exiting; task_team = %p "
@@ -3561,6 +3632,26 @@
                     ((team != NULL) ? team->t.t_id : -1), other_team));
     }
   }
+
+  // For regular thread, task enabling should be called when the task is going
+  // to be pushed to a deque. However, for the hidden helper thread, we need
+  // it ahead of time so that some operations can be performed without race
+  // condition.
+ if (this_thr == __kmp_hidden_helper_main_thread) { + for (int i = 0; i < 2; ++i) { + kmp_task_team_t *task_team = team->t.t_task_team[i]; + if (KMP_TASKING_ENABLED(task_team)) { + continue; + } + __kmp_enable_tasking(task_team, this_thr); + for (int j = 0; j < task_team->tt.tt_nproc; ++j) { + kmp_thread_data_t *thread_data = &task_team->tt.tt_threads_data[j]; + if (thread_data->td.td_deque == NULL) { + __kmp_alloc_task_deque(__kmp_hidden_helper_threads[j], thread_data); + } + } + } + } } // __kmp_task_team_sync: Propagation of task team data from team to threads diff --git a/openmp/runtime/src/kmp_wait_release.h b/openmp/runtime/src/kmp_wait_release.h --- a/openmp/runtime/src/kmp_wait_release.h +++ b/openmp/runtime/src/kmp_wait_release.h @@ -389,6 +389,26 @@ break; } + // For hidden helper thread, if task_team is nullptr, it means the main + // thread has not released the barrier. We cannot wait here because once the + // main thread releases all children barriers, all hidden helper threads are + // still sleeping. This leads to a problem that following configuration, + // such as task team sync, will not be performed such that this thread does + // not have task team. Usually it is not bad. However, a corner case is, + // when the first task encountered is an untied task, the check in + // __kmp_task_alloc will crash because it uses the task team pointer without + // checking whether it is nullptr. It is probably under some kind of + // assumption. + if (task_team && KMP_HIDDEN_HELPER_WORKER_THREAD(th_gtid) && + !TCR_4(__kmp_hidden_helper_team_done)) { + // If there is still hidden helper tasks to be executed, the hidden helper + // thread will not enter a waiting status. 
+ if (KMP_ATOMIC_LD_ACQ(&__kmp_unexecuted_hidden_helper_tasks) == 0) { + __kmp_hidden_helper_worker_thread_wait(); + } + continue; + } + // Don't suspend if KMP_BLOCKTIME is set to "infinite" if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME && __kmp_pause_status != kmp_soft_paused) diff --git a/openmp/runtime/src/z_Linux_util.cpp b/openmp/runtime/src/z_Linux_util.cpp --- a/openmp/runtime/src/z_Linux_util.cpp +++ b/openmp/runtime/src/z_Linux_util.cpp @@ -25,6 +25,7 @@ #include #endif #include // HUGE_VAL. +#include #include #include #include @@ -2447,7 +2448,7 @@ , void **exit_frame_ptr #endif - ) { +) { #if OMPT_SUPPORT *exit_frame_ptr = OMPT_GET_FRAME_ADDRESS(0); #endif @@ -2526,4 +2527,165 @@ #endif +// Functions for hidden helper task +namespace { +// Condition variable for initializing hidden helper team +pthread_cond_t hidden_helper_threads_initz_cond_var; +pthread_mutex_t hidden_helper_threads_initz_lock; +volatile int hidden_helper_initz_signaled = FALSE; + +// Condition variable for deinitializing hidden helper team +pthread_cond_t hidden_helper_threads_deinitz_cond_var; +pthread_mutex_t hidden_helper_threads_deinitz_lock; +volatile int hidden_helper_deinitz_signaled = FALSE; + +// Condition variable for the wrapper function of main thread +pthread_cond_t hidden_helper_main_thread_cond_var; +pthread_mutex_t hidden_helper_main_thread_lock; +volatile int hidden_helper_main_thread_signaled = FALSE; + +// Semaphore for worker threads. We don't use condition variable here in case +// that when multiple signals are sent at the same time, only one thread might +// be waken. 
+sem_t hidden_helper_task_sem; +} // namespace + +void __kmp_hidden_helper_worker_thread_wait() { + int status = sem_wait(&hidden_helper_task_sem); + KMP_CHECK_SYSFAIL("sem_wait", status); +} + +void __kmp_do_initialize_hidden_helper_threads() { + // Initialize condition variable + int status = + pthread_cond_init(&hidden_helper_threads_initz_cond_var, nullptr); + KMP_CHECK_SYSFAIL("pthread_cond_init", status); + + status = pthread_cond_init(&hidden_helper_threads_deinitz_cond_var, nullptr); + KMP_CHECK_SYSFAIL("pthread_cond_init", status); + + status = pthread_cond_init(&hidden_helper_main_thread_cond_var, nullptr); + KMP_CHECK_SYSFAIL("pthread_cond_init", status); + + status = pthread_mutex_init(&hidden_helper_threads_initz_lock, nullptr); + KMP_CHECK_SYSFAIL("pthread_mutex_init", status); + + status = pthread_mutex_init(&hidden_helper_threads_deinitz_lock, nullptr); + KMP_CHECK_SYSFAIL("pthread_mutex_init", status); + + status = pthread_mutex_init(&hidden_helper_main_thread_lock, nullptr); + KMP_CHECK_SYSFAIL("pthread_mutex_init", status); + + // Initialize the semaphore + status = sem_init(&hidden_helper_task_sem, 0, 0); + KMP_CHECK_SYSFAIL("sem_init", status); + + // Create a new thread to finish initialization + pthread_t handle; + status = pthread_create( + &handle, nullptr, + [](void *) -> void * { + __kmp_hidden_helper_threads_initz_routine(); + return nullptr; + }, + nullptr); + KMP_CHECK_SYSFAIL("pthread_create", status); +} + +void __kmp_hidden_helper_threads_initz_wait() { + // Initial thread waits here for the completion of the initialization. The + // condition variable will be notified by main thread of hidden helper teams. 
+  int status = pthread_mutex_lock(&hidden_helper_threads_initz_lock);
+  KMP_CHECK_SYSFAIL("pthread_mutex_lock", status);
+
+  while (!TCR_4(hidden_helper_initz_signaled)) { // re-check: spurious wakeups
+    status = pthread_cond_wait(&hidden_helper_threads_initz_cond_var,
+                               &hidden_helper_threads_initz_lock);
+    KMP_CHECK_SYSFAIL("pthread_cond_wait", status);
+  }
+
+  status = pthread_mutex_unlock(&hidden_helper_threads_initz_lock);
+  KMP_CHECK_SYSFAIL("pthread_mutex_unlock", status);
+}
+
+void __kmp_hidden_helper_initz_release() {
+  // Signal the initz condition variable and set its flag, both under the lock.
+  int status = pthread_mutex_lock(&hidden_helper_threads_initz_lock);
+  KMP_CHECK_SYSFAIL("pthread_mutex_lock", status);
+
+  status = pthread_cond_signal(&hidden_helper_threads_initz_cond_var);
+  KMP_CHECK_SYSFAIL("pthread_cond_signal", status);
+
+  TCW_SYNC_4(hidden_helper_initz_signaled, TRUE);
+
+  status = pthread_mutex_unlock(&hidden_helper_threads_initz_lock);
+  KMP_CHECK_SYSFAIL("pthread_mutex_unlock", status);
+}
+
+void __kmp_hidden_helper_main_thread_wait() {
+  // The main thread of hidden helper team will be blocked here. The
+  // condition variable is only signaled in the destructor of the RTL.
+  int status = pthread_mutex_lock(&hidden_helper_main_thread_lock);
+  KMP_CHECK_SYSFAIL("pthread_mutex_lock", status);
+
+  while (!TCR_4(hidden_helper_main_thread_signaled)) { // spurious wakeups
+    status = pthread_cond_wait(&hidden_helper_main_thread_cond_var,
+                               &hidden_helper_main_thread_lock);
+    KMP_CHECK_SYSFAIL("pthread_cond_wait", status);
+  }
+
+  status = pthread_mutex_unlock(&hidden_helper_main_thread_lock);
+  KMP_CHECK_SYSFAIL("pthread_mutex_unlock", status);
+}
+
+void __kmp_hidden_helper_main_thread_release() {
+  // The initial thread of OpenMP RTL should call this function to wake up the
+  // main thread of hidden helper team.
+  int status = pthread_mutex_lock(&hidden_helper_main_thread_lock);
+  KMP_CHECK_SYSFAIL("pthread_mutex_lock", status);
+
+  status = pthread_cond_signal(&hidden_helper_main_thread_cond_var);
+  KMP_CHECK_SYSFAIL("pthread_cond_signal", status);
+
+  // The hidden helper team is done here
+  TCW_SYNC_4(hidden_helper_main_thread_signaled, TRUE);
+
+  status = pthread_mutex_unlock(&hidden_helper_main_thread_lock);
+  KMP_CHECK_SYSFAIL("pthread_mutex_unlock", status);
+}
+
+void __kmp_hidden_helper_worker_thread_signal() {
+  int status = sem_post(&hidden_helper_task_sem);
+  KMP_CHECK_SYSFAIL("sem_post", status);
+}
+
+void __kmp_hidden_helper_threads_deinitz_wait() {
+  // Initial thread waits here for the completion of the deinitialization. The
+  // condition variable will be notified by main thread of hidden helper teams.
+  int status = pthread_mutex_lock(&hidden_helper_threads_deinitz_lock);
+  KMP_CHECK_SYSFAIL("pthread_mutex_lock", status);
+
+  while (!TCR_4(hidden_helper_deinitz_signaled)) { // re-check: spurious wakeups
+    status = pthread_cond_wait(&hidden_helper_threads_deinitz_cond_var,
+                               &hidden_helper_threads_deinitz_lock);
+    KMP_CHECK_SYSFAIL("pthread_cond_wait", status);
+  }
+
+  status = pthread_mutex_unlock(&hidden_helper_threads_deinitz_lock);
+  KMP_CHECK_SYSFAIL("pthread_mutex_unlock", status);
+}
+
+void __kmp_hidden_helper_threads_deinitz_release() {
+  int status = pthread_mutex_lock(&hidden_helper_threads_deinitz_lock);
+  KMP_CHECK_SYSFAIL("pthread_mutex_lock", status);
+
+  status = pthread_cond_signal(&hidden_helper_threads_deinitz_cond_var);
+  KMP_CHECK_SYSFAIL("pthread_cond_signal", status);
+
+  TCW_SYNC_4(hidden_helper_deinitz_signaled, TRUE);
+
+  status = pthread_mutex_unlock(&hidden_helper_threads_deinitz_lock);
+  KMP_CHECK_SYSFAIL("pthread_mutex_unlock", status);
+}
+
 // end of file //
diff --git a/openmp/runtime/src/z_Windows_NT_util.cpp b/openmp/runtime/src/z_Windows_NT_util.cpp
--- a/openmp/runtime/src/z_Windows_NT_util.cpp
+++ b/openmp/runtime/src/z_Windows_NT_util.cpp
@@ -1629,3
+1629,40 @@ return running_threads; } //__kmp_get_load_balance() + +// Functions for hidden helper task +void __kmp_hidden_helper_worker_thread_wait() { + KMP_ASSERT(0 && "Hidden helper task is not supported on Windows"); +} + +void __kmp_do_initialize_hidden_helper_threads() { + KMP_ASSERT(0 && "Hidden helper task is not supported on Windows"); +} + +void __kmp_hidden_helper_threads_initz_wait() { + KMP_ASSERT(0 && "Hidden helper task is not supported on Windows"); +} + +void __kmp_hidden_helper_initz_release() { + KMP_ASSERT(0 && "Hidden helper task is not supported on Windows"); +} + +void __kmp_hidden_helper_main_thread_wait() { + KMP_ASSERT(0 && "Hidden helper task is not supported on Windows"); +} + +void __kmp_hidden_helper_main_thread_release() { + KMP_ASSERT(0 && "Hidden helper task is not supported on Windows"); +} + +void __kmp_hidden_helper_worker_thread_signal() { + KMP_ASSERT(0 && "Hidden helper task is not supported on Windows"); +} + +void __kmp_hidden_helper_threads_deinitz_wait() { + KMP_ASSERT(0 && "Hidden helper task is not supported on Windows"); +} + +void __kmp_hidden_helper_threads_deinitz_release() { + KMP_ASSERT(0 && "Hidden helper task is not supported on Windows"); +} diff --git a/openmp/runtime/test/tasking/hidden_helper_task/common.h b/openmp/runtime/test/tasking/hidden_helper_task/common.h new file mode 100644 --- /dev/null +++ b/openmp/runtime/test/tasking/hidden_helper_task/common.h @@ -0,0 +1,59 @@ +#include +#include +#include + +extern "C" { +struct ident_t; + +using kmp_int32 = int32_t; +using kmp_int64 = int64_t; +using kmp_routine_entry_t = kmp_int32 (*)(kmp_int32, void *); +using kmp_intptr_t = intptr_t; + +typedef struct kmp_depend_info { + kmp_intptr_t base_addr; + size_t len; + struct { + bool in : 1; + bool out : 1; + bool mtx : 1; + } flags; +} kmp_depend_info_t; + +typedef union kmp_cmplrdata { + kmp_int32 priority; + kmp_routine_entry_t destructors; +} kmp_cmplrdata_t; + +typedef struct kmp_task { + void *shareds; + 
kmp_routine_entry_t routine; + kmp_int32 part_id; + kmp_cmplrdata_t data1; + kmp_cmplrdata_t data2; +} kmp_task_t; + +int32_t __kmpc_global_thread_num(void *); +kmp_task_t *__kmpc_omp_task_alloc(ident_t *, kmp_int32, kmp_int32, size_t, + size_t, kmp_routine_entry_t); +kmp_task_t *__kmpc_omp_target_task_alloc(ident_t *, kmp_int32, kmp_int32, + size_t, size_t, kmp_routine_entry_t, + kmp_int64); +kmp_int32 __kmpc_omp_taskwait(ident_t *, kmp_int32); +kmp_int32 __kmpc_omp_task(ident_t *, kmp_int32, kmp_task_t *); +kmp_int32 __kmpc_omp_task_with_deps(ident_t *loc_ref, kmp_int32 gtid, + kmp_task_t *new_task, kmp_int32 ndeps, + kmp_depend_info_t *dep_list, + kmp_int32 ndeps_noalias, + kmp_depend_info_t *noalias_dep_list); +void __kmpc_taskgroup(ident_t *, kmp_int32); +void __kmpc_end_taskgroup(ident_t *, kmp_int32); +} + +static kmp_int32 get_num_hidden_helper_threads() { + static kmp_int32 __kmp_hidden_helper_threads_num = 8; + if (const char *env = std::getenv("LIBOMP_NUM_HIDDEN_HELPER_THREADS")) { + return std::stoi(env); + } + return __kmp_hidden_helper_threads_num; +} diff --git a/openmp/runtime/test/tasking/hidden_helper_task/depend.cpp b/openmp/runtime/test/tasking/hidden_helper_task/depend.cpp new file mode 100644 --- /dev/null +++ b/openmp/runtime/test/tasking/hidden_helper_task/depend.cpp @@ -0,0 +1,131 @@ +// RUN: %libomp-cxx-compile-and-run + +/* + * This test aims to check whether hidden helper task can work with regular task + * in terms of dependences. 
It is equivalent to the following code:
+ *
+ * #pragma omp parallel for
+ * for (int i = 0; i < N; ++i) {
+ *   int data = 0;
+ *   #pragma omp task shared(data) depend(out: data)
+ *   {
+ *     data += 1;
+ *   }
+ *   #pragma omp hidden helper task shared(data) depend(inout: data)
+ *   {
+ *     data += 2;
+ *   }
+ *   #pragma omp hidden helper task shared(data) depend(inout: data)
+ *   {
+ *     data += 4;
+ *   }
+ *   #pragma omp task shared(data) depend(inout: data)
+ *   {
+ *     data += 8;
+ *   }
+ *   #pragma omp taskwait
+ *   assert(data == 15);
+ * }
+ */
+
+#include "common.h"
+
+extern "C" {
+struct kmp_task_t_with_privates {
+  kmp_task_t task;
+};
+
+struct anon {
+  int32_t *data;
+};
+}
+
+template <int I>
+kmp_int32 omp_task_entry(kmp_int32 gtid, kmp_task_t_with_privates *task) {
+  auto shareds = reinterpret_cast<anon *>(task->task.shareds);
+  auto p = shareds->data;
+  *p += I;
+  return 0;
+}
+
+int main(int argc, char *argv[]) {
+  constexpr const int N = 1024;
+#pragma omp parallel for
+  for (int i = 0; i < N; ++i) {
+    int32_t gtid = __kmpc_global_thread_num(nullptr);
+    int32_t data = 0;
+
+    // Task 1
+    auto task1 = __kmpc_omp_task_alloc(
+        nullptr, gtid, 1, sizeof(kmp_task_t_with_privates), sizeof(anon),
+        reinterpret_cast<kmp_routine_entry_t>(omp_task_entry<1>));
+
+    auto shareds = reinterpret_cast<anon *>(task1->shareds);
+    shareds->data = &data;
+
+    kmp_depend_info_t depinfo1 = {}; // zero all flag bits before setting any
+    depinfo1.base_addr = reinterpret_cast<kmp_intptr_t>(&data);
+    depinfo1.flags.out = 1;
+    depinfo1.len = 4;
+
+    __kmpc_omp_task_with_deps(nullptr, gtid, task1, 1, &depinfo1, 0, nullptr);
+
+    // Task 2
+    auto task2 = __kmpc_omp_target_task_alloc(
+        nullptr, gtid, 1, sizeof(kmp_task_t_with_privates), sizeof(anon),
+        reinterpret_cast<kmp_routine_entry_t>(omp_task_entry<2>), -1);
+
+    shareds = reinterpret_cast<anon *>(task2->shareds);
+    shareds->data = &data;
+
+    kmp_depend_info_t depinfo2 = {}; // zero all flag bits before setting any
+    depinfo2.base_addr = reinterpret_cast<kmp_intptr_t>(&data);
+    depinfo2.flags.in = 1;
+    depinfo2.flags.out = 1;
+    depinfo2.len = 4;
+
+    __kmpc_omp_task_with_deps(nullptr, gtid, task2, 1, &depinfo2, 0, nullptr);
+
+    // Task 3
+
auto task3 = __kmpc_omp_target_task_alloc(
+        nullptr, gtid, 1, sizeof(kmp_task_t_with_privates), sizeof(anon),
+        reinterpret_cast<kmp_routine_entry_t>(omp_task_entry<4>), -1);
+
+    shareds = reinterpret_cast<anon *>(task3->shareds);
+    shareds->data = &data;
+
+    kmp_depend_info_t depinfo3 = {}; // zero all flag bits before setting any
+    depinfo3.base_addr = reinterpret_cast<kmp_intptr_t>(&data);
+    depinfo3.flags.in = 1;
+    depinfo3.flags.out = 1;
+    depinfo3.len = 4;
+
+    __kmpc_omp_task_with_deps(nullptr, gtid, task3, 1, &depinfo3, 0, nullptr);
+
+    // Task 4
+    auto task4 = __kmpc_omp_task_alloc(
+        nullptr, gtid, 1, sizeof(kmp_task_t_with_privates), sizeof(anon),
+        reinterpret_cast<kmp_routine_entry_t>(omp_task_entry<8>));
+
+    shareds = reinterpret_cast<anon *>(task4->shareds);
+    shareds->data = &data;
+
+    kmp_depend_info_t depinfo4 = {}; // zero all flag bits before setting any
+    depinfo4.base_addr = reinterpret_cast<kmp_intptr_t>(&data);
+    depinfo4.flags.in = 1;
+    depinfo4.flags.out = 1;
+    depinfo4.len = 4;
+
+    __kmpc_omp_task_with_deps(nullptr, gtid, task4, 1, &depinfo4, 0, nullptr);
+
+    // Wait for all tasks
+    __kmpc_omp_taskwait(nullptr, gtid);
+
+    assert(data == 15);
+  }
+
+  std::cout << "PASS\n";
+  return 0;
+}
+
+// CHECK: PASS
diff --git a/openmp/runtime/test/tasking/hidden_helper_task/gtid.cpp b/openmp/runtime/test/tasking/hidden_helper_task/gtid.cpp
new file mode 100644
--- /dev/null
+++ b/openmp/runtime/test/tasking/hidden_helper_task/gtid.cpp
@@ -0,0 +1,132 @@
+// RUN: %libomp-cxx-compile-and-run
+
+/*
+ * This test checks that hidden helper threads get the right gtid. It also
+ * checks that, with mixed dependences between regular tasks and hidden helper
+ * tasks, each task is executed by the right set of threads.
It is equivalent to + * the following code: + * + * #pragma omp parallel for + * for (int i = 0; i < N; ++i) { + * int data1 = -1, data2 = -1, data3 = -1; + * int depvar; + * #pragma omp task shared(data1) depend(inout: depvar) + * { + * data1 = omp_get_global_thread_id(); + * } + * #pragma omp task hidden helper shared(data2) depend(inout: depvar) + * { + * data2 = omp_get_global_thread_id(); + * } + * #pragma omp task shared(data3) depend(inout: depvar) + * { + * data3 = omp_get_global_thread_id(); + * } + * #pragma omp taskwait + * assert(data1 == 0 || data1 > __kmp_num_hidden_helper_threads); + * assert(data2 > 0 && data2 <= __kmp_num_hidden_helper_threads); + * assert(data3 == 0 || data3 > __kmp_num_hidden_helper_threads); + * } + */ + +#include "common.h" + +extern "C" { +struct kmp_task_t_with_privates { + kmp_task_t task; +}; + +struct anon { + int32_t *data; +}; +} + +kmp_int32 __kmp_hidden_helper_threads_num; + +kmp_int32 omp_task_entry(kmp_int32 gtid, kmp_task_t_with_privates *task) { + auto shareds = reinterpret_cast(task->task.shareds); + auto p = shareds->data; + *p = __kmpc_global_thread_num(nullptr); + return 0; +} + +template void assert_gtid(int v) { + if (__kmp_hidden_helper_threads_num) { + if (hidden_helper_task) { + assert(v > 0 && v <= __kmp_hidden_helper_threads_num); + } else { + assert(v == 0 || v > __kmp_hidden_helper_threads_num); + } + } else { + assert(v >= 0); + } +} + +int main(int argc, char *argv[]) { + __kmp_hidden_helper_threads_num = get_num_hidden_helper_threads(); + + constexpr const int N = 1024; +#pragma omp parallel for + for (int i = 0; i < N; ++i) { + int32_t data1 = -1, data2 = -1, data3 = -1; + int depvar; + int32_t gtid = __kmpc_global_thread_num(nullptr); + + // Task 1, regular task + auto task1 = __kmpc_omp_task_alloc( + nullptr, gtid, 1, sizeof(kmp_task_t_with_privates), sizeof(anon), + reinterpret_cast(omp_task_entry)); + auto shareds = reinterpret_cast(task1->shareds); + shareds->data = &data1; + + 
kmp_depend_info_t depinfo1 = {}; // zero all flag bits before setting any
+    depinfo1.base_addr = reinterpret_cast<kmp_intptr_t>(&depvar);
+    depinfo1.flags.in = 1;
+    depinfo1.flags.out = 1;
+    depinfo1.len = 4;
+
+    __kmpc_omp_task_with_deps(nullptr, gtid, task1, 1, &depinfo1, 0, nullptr);
+
+    // Task 2, hidden helper task
+    auto task2 = __kmpc_omp_target_task_alloc(
+        nullptr, gtid, 1, sizeof(kmp_task_t_with_privates), sizeof(anon),
+        reinterpret_cast<kmp_routine_entry_t>(omp_task_entry), -1);
+    shareds = reinterpret_cast<anon *>(task2->shareds);
+    shareds->data = &data2;
+
+    kmp_depend_info_t depinfo2 = {}; // zero all flag bits before setting any
+    depinfo2.base_addr = reinterpret_cast<kmp_intptr_t>(&depvar);
+    depinfo2.flags.in = 1;
+    depinfo2.flags.out = 1;
+    depinfo2.len = 4;
+
+    __kmpc_omp_task_with_deps(nullptr, gtid, task2, 1, &depinfo2, 0, nullptr);
+
+    // Task 3, regular task
+    auto task3 = __kmpc_omp_task_alloc(
+        nullptr, gtid, 1, sizeof(kmp_task_t_with_privates), sizeof(anon),
+        reinterpret_cast<kmp_routine_entry_t>(omp_task_entry));
+    shareds = reinterpret_cast<anon *>(task3->shareds);
+    shareds->data = &data3;
+
+    kmp_depend_info_t depinfo3 = {}; // zero all flag bits before setting any
+    depinfo3.base_addr = reinterpret_cast<kmp_intptr_t>(&depvar);
+    depinfo3.flags.in = 1;
+    depinfo3.flags.out = 1;
+    depinfo3.len = 4;
+
+    __kmpc_omp_task_with_deps(nullptr, gtid, task3, 1, &depinfo3, 0, nullptr);
+
+    __kmpc_omp_taskwait(nullptr, gtid);
+
+    // Regular tasks (1, 3) and the hidden helper task (2) check their gtid range
+    assert_gtid<false>(data1);
+    assert_gtid<true>(data2);
+    assert_gtid<false>(data3);
+  }
+
+  std::cout << "PASS\n";
+  return 0;
+}
+
+// CHECK: PASS
diff --git a/openmp/runtime/test/tasking/hidden_helper_task/taskgroup.cpp b/openmp/runtime/test/tasking/hidden_helper_task/taskgroup.cpp
new file mode 100644
--- /dev/null
+++ b/openmp/runtime/test/tasking/hidden_helper_task/taskgroup.cpp
@@ -0,0 +1,78 @@
+// RUN: %libomp-cxx-compile-and-run
+
+/*
+ * This test checks that hidden helper tasks created inside a taskgroup have
+ * completed when the taskgroup ends.
It is equivalent to the following code: + * + * #pragma omp parallel + * for (int i = 0; i < N; ++i) { + * int data1 = 0, data2 = 0; + * #pragma omp taskgroup + * { + * #pragma omp hidden helper task shared(data1) + * { + * data1 = 1; + * } + * #pragma omp hidden helper task shared(data2) + * { + * data2 = 2; + * } + * } + * assert(data1 == 1); + * assert(data2 == 2); + * } + */ + +#include "common.h" + +extern "C" { +struct kmp_task_t_with_privates { + kmp_task_t task; +}; + +struct anon { + int32_t *data; +}; +} + +template +kmp_int32 omp_task_entry(kmp_int32 gtid, kmp_task_t_with_privates *task) { + auto shareds = reinterpret_cast(task->task.shareds); + auto p = shareds->data; + *p = I; + return 0; +} + +int main(int argc, char *argv[]) { + constexpr const int N = 1024; +#pragma omp parallel for + for (int i = 0; i < N; ++i) { + int32_t gtid = __kmpc_global_thread_num(nullptr); + int32_t data1 = 0, data2 = 0; + __kmpc_taskgroup(nullptr, gtid); + + auto task1 = __kmpc_omp_target_task_alloc( + nullptr, gtid, 1, sizeof(kmp_task_t_with_privates), sizeof(anon), + reinterpret_cast(omp_task_entry<1>), -1); + auto shareds = reinterpret_cast(task1->shareds); + shareds->data = &data1; + __kmpc_omp_task(nullptr, gtid, task1); + + auto task2 = __kmpc_omp_target_task_alloc( + nullptr, gtid, 1, sizeof(kmp_task_t_with_privates), sizeof(anon), + reinterpret_cast(omp_task_entry<2>), -1); + shareds = reinterpret_cast(task2->shareds); + shareds->data = &data2; + __kmpc_omp_task(nullptr, gtid, task2); + + __kmpc_end_taskgroup(nullptr, gtid); + + assert(data1 == 1); + assert(data2 == 2); + } + + std::cout << "PASS\n"; + return 0; +} + +// CHECK: PASS diff --git a/openmp/runtime/test/worksharing/for/kmp_sch_simd_guided.c b/openmp/runtime/test/worksharing/for/kmp_sch_simd_guided.c --- a/openmp/runtime/test/worksharing/for/kmp_sch_simd_guided.c +++ b/openmp/runtime/test/worksharing/for/kmp_sch_simd_guided.c @@ -5,6 +5,7 @@ value 46 to the OpenMP RTL. 
Test uses numerous loop parameter combinations. */ #include +#include #include #if defined(WIN32) || defined(_WIN32) @@ -47,6 +48,9 @@ // End of definitions copied from OpenMP RTL. // --------------------------------------------------------------------------- static id loc = {0, 2, 0, 0, ";file;func;0;0;;"}; +// This variable is defined in OpenMP RTL but we can't have it exposed so we +// need to redefine it here. +static int __kmp_hidden_helper_threads_num = 8; // --------------------------------------------------------------------------- int run_loop_64(i64 loop_lb, i64 loop_ub, i64 loop_st, int loop_chunk) { @@ -58,6 +62,9 @@ int rc; int tid = omp_get_thread_num(); int gtid = tid; + if (gtid) { + gtid += __kmp_hidden_helper_threads_num; + } int last; #if DEBUG printf("run_loop_<%d>(lb=%d, ub=%d, st=%d, ch=%d)\n", @@ -210,6 +217,9 @@ int rc; int tid = omp_get_thread_num(); int gtid = tid; + if (gtid) { + gtid += __kmp_hidden_helper_threads_num; + } int last; #if DEBUG printf("run_loop_<%d>(lb=%d, ub=%d, st=%d, ch=%d)\n", @@ -397,6 +407,13 @@ // --------------------------------------------------------------------------- int main() { + { + const char *env = getenv("LIBOMP_NUM_HIDDEN_HELPER_THREADS"); + if (env) { + __kmp_hidden_helper_threads_num = atoi(env); + } + } + int n, err = 0; for (n = 1; n <= 4; ++ n) { err += run_32(n);