Index: openmp/trunk/runtime/src/CMakeLists.txt
===================================================================
--- openmp/trunk/runtime/src/CMakeLists.txt
+++ openmp/trunk/runtime/src/CMakeLists.txt
@@ -73,7 +73,6 @@
   kmp_settings.cpp
   kmp_str.cpp
   kmp_tasking.cpp
-  kmp_taskq.cpp
   kmp_threadprivate.cpp
   kmp_utility.cpp
   kmp_barrier.cpp
@@ -330,4 +329,3 @@
     DESTINATION ${LIBOMP_HEADERS_INSTALL_PATH}
   )
 endif()
-
Index: openmp/trunk/runtime/src/dllexports
===================================================================
--- openmp/trunk/runtime/src/dllexports
+++ openmp/trunk/runtime/src/dllexports
@@ -227,8 +227,8 @@
     __kmpc_end_ordered 125
     __kmpc_end_serialized_parallel 126
     __kmpc_end_single 127
-    __kmpc_end_taskq 128
-    __kmpc_end_taskq_task 129
+#   __kmpc_end_taskq 128
+#   __kmpc_end_taskq_task 129
     __kmpc_flush 130
     __kmpc_for_static_fini 135
     __kmpc_for_static_init_4 136
@@ -245,10 +245,10 @@
     __kmpc_push_num_threads 148
     __kmpc_serialized_parallel 150
     __kmpc_single 151
-    __kmpc_task 152
-    __kmpc_task_buffer 153
-    __kmpc_taskq 154
-    __kmpc_taskq_task 155
+#   __kmpc_task 152
+#   __kmpc_task_buffer 153
+#   __kmpc_taskq 154
+#   __kmpc_taskq_task 155
     __kmpc_threadprivate 156
     __kmpc_threadprivate_cached 157
     __kmpc_threadprivate_register 158
Index: openmp/trunk/runtime/src/kmp.h
===================================================================
--- openmp/trunk/runtime/src/kmp.h
+++ openmp/trunk/runtime/src/kmp.h
@@ -1165,28 +1165,15 @@
   ct_pdo_ordered,
   ct_psections,
   ct_psingle,
-
-  /* the following must be left in order and not split up */
-  ct_taskq,
-  ct_task, // really task inside non-ordered taskq, considered worksharing type
-  ct_task_ordered, /* really task inside ordered taskq, considered a worksharing
-                      type */
-  /* the preceding must be left in order and not split up */
-
   ct_critical,
   ct_ordered_in_parallel,
   ct_ordered_in_pdo,
-  ct_ordered_in_taskq,
   ct_master,
   ct_reduce,
   ct_barrier
 };
 
-/* test to see if we are in a taskq construct */
-#define IS_CONS_TYPE_TASKQ(ct)                                                 \
-  (((int)(ct)) >= ((int)ct_taskq) && ((int)(ct)) <= ((int)ct_task_ordered))
-#define IS_CONS_TYPE_ORDERED(ct)                                               \
-  ((ct) == ct_pdo_ordered || (ct) == ct_task_ordered)
+#define IS_CONS_TYPE_ORDERED(ct) ((ct) == ct_pdo_ordered)
 
 struct cons_data {
   ident_t const *ident;
@@ -1260,163 +1247,6 @@
 #define KMP_ITT_DEBUG 0
 #endif /* USE_ITT_BUILD */
 
-/* Taskq data structures */
-
-#define HIGH_WATER_MARK(nslots) (((nslots)*3) / 4)
-// num thunks that each thread can simultaneously execute from a task queue
-#define __KMP_TASKQ_THUNKS_PER_TH 1
-
-/* flags for taskq_global_flags, kmp_task_queue_t tq_flags, kmpc_thunk_t
-   th_flags */
-
-#define TQF_IS_ORDERED 0x0001 // __kmpc_taskq interface, taskq ordered
-// __kmpc_taskq interface, taskq with lastprivate list
-#define TQF_IS_LASTPRIVATE 0x0002
-#define TQF_IS_NOWAIT 0x0004 // __kmpc_taskq interface, end taskq nowait
-// __kmpc_taskq interface, use heuristics to decide task queue size
-#define TQF_HEURISTICS 0x0008
-
-// __kmpc_taskq interface, reserved for future use
-#define TQF_INTERFACE_RESERVED1 0x0010
-// __kmpc_taskq interface, reserved for future use
-#define TQF_INTERFACE_RESERVED2 0x0020
-// __kmpc_taskq interface, reserved for future use
-#define TQF_INTERFACE_RESERVED3 0x0040
-// __kmpc_taskq interface, reserved for future use
-#define TQF_INTERFACE_RESERVED4 0x0080
-
-#define TQF_INTERFACE_FLAGS 0x00ff // all the __kmpc_taskq interface flags
-// internal/read by instrumentation; only used with TQF_IS_LASTPRIVATE
-#define TQF_IS_LAST_TASK 0x0100
-// internal use only; this thunk->th_task is the taskq_task
-#define TQF_TASKQ_TASK 0x0200
-// internal use only; must release worker threads once ANY queued task
-// exists (global)
-#define TQF_RELEASE_WORKERS 0x0400
-// internal use only; notify workers that master has finished enqueuing tasks
-#define TQF_ALL_TASKS_QUEUED 0x0800
-// internal use only: this queue encountered in parallel context: not serialized
-#define TQF_PARALLEL_CONTEXT 0x1000
-// internal use only; this queue is on the freelist and not in use
-#define TQF_DEALLOCATED 0x2000
-
-#define TQF_INTERNAL_FLAGS 0x3f00 // all the internal use only flags
-
-typedef struct KMP_ALIGN_CACHE kmpc_aligned_int32_t {
-  kmp_int32 ai_data;
-} kmpc_aligned_int32_t;
-
-typedef struct KMP_ALIGN_CACHE kmpc_aligned_queue_slot_t {
-  struct kmpc_thunk_t *qs_thunk;
-} kmpc_aligned_queue_slot_t;
-
-typedef struct kmpc_task_queue_t {
-  /* task queue linkage fields for n-ary tree of queues (locked with global
-     taskq_tree_lck) */
-  kmp_lock_t tq_link_lck; /* lock for child link, child next/prev links and
-                             child ref counts */
-  union {
-    struct kmpc_task_queue_t *tq_parent; // pointer to parent taskq, not locked
-    // for taskq internal freelists, locked with global taskq_freelist_lck
-    struct kmpc_task_queue_t *tq_next_free;
-  } tq;
-  // pointer to linked-list of children, locked by tq's tq_link_lck
-  volatile struct kmpc_task_queue_t *tq_first_child;
-  // next child in linked-list, locked by parent tq's tq_link_lck
-  struct kmpc_task_queue_t *tq_next_child;
-  // previous child in linked-list, locked by parent tq's tq_link_lck
-  struct kmpc_task_queue_t *tq_prev_child;
-  // reference count of threads with access to this task queue
-  volatile kmp_int32 tq_ref_count;
-  /* (other than the thread executing the kmpc_end_taskq call) */
-  /* locked by parent tq's tq_link_lck */
-
-  /* shared data for task queue */
-  /* per-thread array of pointers to shared variable structures */
-  struct kmpc_aligned_shared_vars_t *tq_shareds;
-  /* only one array element exists for all but outermost taskq */
-
-  /* bookkeeping for ordered task queue */
-  kmp_uint32 tq_tasknum_queuing; // ordered task # assigned while queuing tasks
-  // ordered number of next task to be served (executed)
-  volatile kmp_uint32 tq_tasknum_serving;
-
-  /* thunk storage management for task queue */
-  kmp_lock_t tq_free_thunks_lck; /* lock for thunk freelist manipulation */
-  // thunk freelist, chained via th.th_next_free
-  struct kmpc_thunk_t *tq_free_thunks;
-  // space allocated for thunks for this task queue
-  struct kmpc_thunk_t *tq_thunk_space;
-
-  /* data fields for queue itself */
-  kmp_lock_t tq_queue_lck; /* lock for [de]enqueue operations: tq_queue,
-                              tq_head, tq_tail, tq_nfull */
-  /* array of queue slots to hold thunks for tasks */
-  kmpc_aligned_queue_slot_t *tq_queue;
-  volatile struct kmpc_thunk_t *tq_taskq_slot; /* special slot for taskq task
-                                                  thunk, occupied if not NULL */
-  kmp_int32 tq_nslots; /* # of tq_thunk_space thunks alloc'd (not incl.
-                          tq_taskq_slot space) */
-  kmp_int32 tq_head; // enqueue puts item here (index into tq_queue array)
-  kmp_int32 tq_tail; // dequeue takes item from here (index into tq_queue array)
-  volatile kmp_int32 tq_nfull; // # of occupied entries in task queue right now
-  kmp_int32 tq_hiwat; /* high-water mark for tq_nfull and queue scheduling */
-  volatile kmp_int32 tq_flags; /* TQF_xxx */
-
-  /* bookkeeping for outstanding thunks */
-
-  /* per-thread array for # of regular thunks currently being executed */
-  struct kmpc_aligned_int32_t *tq_th_thunks;
-  kmp_int32 tq_nproc; /* number of thunks in the th_thunks array */
-
-  /* statistics library bookkeeping */
-  ident_t *tq_loc; /* source location information for taskq directive */
-} kmpc_task_queue_t;
-
-typedef void (*kmpc_task_t)(kmp_int32 global_tid, struct kmpc_thunk_t *thunk);
-
-/* sizeof_shareds passed as arg to __kmpc_taskq call */
-typedef struct kmpc_shared_vars_t { /* aligned during dynamic allocation */
-  kmpc_task_queue_t *sv_queue; /* (pointers to) shared vars */
-} kmpc_shared_vars_t;
-
-typedef struct KMP_ALIGN_CACHE kmpc_aligned_shared_vars_t {
-  volatile struct kmpc_shared_vars_t *ai_data;
-} kmpc_aligned_shared_vars_t;
-
-/* sizeof_thunk passed as arg to kmpc_taskq call */
-typedef struct kmpc_thunk_t { /* aligned during dynamic allocation */
-  union { /* field used for internal freelists too */
-    kmpc_shared_vars_t *th_shareds;
-    struct kmpc_thunk_t *th_next_free; /* freelist of individual thunks within
-                                          queue, head at tq_free_thunks */
-  } th;
-  kmpc_task_t th_task; /* taskq_task if flags & TQF_TASKQ_TASK */
-  struct kmpc_thunk_t *th_encl_thunk; /* pointer to dynamically enclosing thunk
-                                         on this thread's call stack */
-  // TQF_xxx(tq_flags interface plus possible internal flags)
-  kmp_int32 th_flags;
-
-  kmp_int32 th_status;
-  kmp_uint32 th_tasknum; /* task number assigned in order of queuing, used for
-                            ordered sections */
-  /* private vars */
-} kmpc_thunk_t;
-
-typedef struct KMP_ALIGN_CACHE kmp_taskq {
-  int tq_curr_thunk_capacity;
-
-  kmpc_task_queue_t *tq_root;
-  kmp_int32 tq_global_flags;
-
-  kmp_lock_t tq_freelist_lck;
-  kmpc_task_queue_t *tq_freelist;
-
-  kmpc_thunk_t **tq_curr_thunk;
-} kmp_taskq_t;
-
-/* END Taskq data structures */
-
 typedef kmp_int32 kmp_critical_name[8];
 
 /*!
@@ -2770,7 +2600,6 @@
   std::atomic<kmp_int32> t_cancel_request;
 #endif
   int t_master_active; // save on fork, restore on join
-  kmp_taskq_t t_taskq; // this team's task queue
   void *t_copypriv_data; // team specific pointer to copyprivate data array
 #if KMP_OS_WINDOWS
   std::atomic<int> t_copyin_counter;
@@ -3776,24 +3605,6 @@
 extern void KMPC_SET_DYNAMIC(int flag);
 extern void KMPC_SET_NESTED(int flag);
 
-/* Taskq interface routines */
-KMP_EXPORT kmpc_thunk_t *__kmpc_taskq(ident_t *loc, kmp_int32 global_tid,
-                                      kmpc_task_t taskq_task,
-                                      size_t sizeof_thunk,
-                                      size_t sizeof_shareds, kmp_int32 flags,
-                                      kmpc_shared_vars_t **shareds);
-KMP_EXPORT void __kmpc_end_taskq(ident_t *loc, kmp_int32 global_tid,
-                                 kmpc_thunk_t *thunk);
-KMP_EXPORT kmp_int32 __kmpc_task(ident_t *loc, kmp_int32 global_tid,
-                                 kmpc_thunk_t *thunk);
-KMP_EXPORT void __kmpc_taskq_task(ident_t *loc, kmp_int32 global_tid,
-                                  kmpc_thunk_t *thunk, kmp_int32 status);
-KMP_EXPORT void __kmpc_end_taskq_task(ident_t *loc, kmp_int32 global_tid,
-                                      kmpc_thunk_t *thunk);
-KMP_EXPORT kmpc_thunk_t *__kmpc_task_buffer(ident_t *loc, kmp_int32 global_tid,
-                                            kmpc_thunk_t *taskq_thunk,
-                                            kmpc_task_t task);
-
 /* OMP 3.0 tasking interface routines */
 KMP_EXPORT kmp_int32 __kmpc_omp_task(ident_t *loc_ref, kmp_int32 gtid,
                                      kmp_task_t *new_task);
Index: openmp/trunk/runtime/src/kmp_error.cpp
===================================================================
--- openmp/trunk/runtime/src/kmp_error.cpp
+++ openmp/trunk/runtime/src/kmp_error.cpp
@@ -28,10 +28,8 @@
     "\"sections\"",
     "work-sharing", /* this is not called "single" because of lowering of
                        "sections" pragmas */
-    "\"taskq\"", "\"taskq\"", "\"taskq ordered\"", "\"critical\"",
-    "\"ordered\"", /* in PARALLEL */
+    "\"critical\"", "\"ordered\"", /* in PARALLEL */
     "\"ordered\"", /* in PDO */
-    "\"ordered\"", /* in TASKQ */
     "\"master\"", "\"reduce\"", "\"barrier\""};
 
 #define get_src(ident) ((ident) == NULL ? NULL : (ident)->psource)
@@ -214,9 +212,7 @@
   if (p->stack_top >= p->stack_size) {
     __kmp_expand_cons_stack(gtid, p);
   }
-  if (p->w_top > p->p_top &&
-      !(IS_CONS_TYPE_TASKQ(p->stack_data[p->w_top].type) &&
-        IS_CONS_TYPE_TASKQ(ct))) {
+  if (p->w_top > p->p_top) {
     // We are already in a WORKSHARE construct for this PARALLEL region.
     __kmp_error_construct2(kmp_i18n_msg_CnsInvalidNesting, ct, ident,
                            &p->stack_data[p->w_top]);
@@ -257,8 +253,7 @@
   if (p->stack_top >= p->stack_size)
     __kmp_expand_cons_stack(gtid, p);
 
-  if (ct == ct_ordered_in_parallel || ct == ct_ordered_in_pdo ||
-      ct == ct_ordered_in_taskq) {
+  if (ct == ct_ordered_in_parallel || ct == ct_ordered_in_pdo) {
     if (p->w_top <= p->p_top) {
 /* we are not in a worksharing construct */
 #ifdef BUILD_PARALLEL_ORDERED
@@ -270,13 +265,8 @@
     } else { /* inside a WORKSHARING construct for this PARALLEL region */
       if (!IS_CONS_TYPE_ORDERED(p->stack_data[p->w_top].type)) {
-        if (p->stack_data[p->w_top].type == ct_taskq) {
-          __kmp_error_construct2(kmp_i18n_msg_CnsNotInTaskConstruct, ct, ident,
-                                 &p->stack_data[p->w_top]);
-        } else {
-          __kmp_error_construct2(kmp_i18n_msg_CnsNoOrderedClause, ct, ident,
-                                 &p->stack_data[p->w_top]);
-        }
+        __kmp_error_construct2(kmp_i18n_msg_CnsNoOrderedClause, ct, ident,
+                               &p->stack_data[p->w_top]);
       }
     }
     if (p->s_top > p->p_top && p->s_top > p->w_top) {
@@ -288,10 +278,8 @@
 
       if (stack_type == ct_critical ||
           ((stack_type == ct_ordered_in_parallel ||
-            stack_type == ct_ordered_in_pdo ||
-            stack_type ==
-                ct_ordered_in_taskq) && /* C doesn't allow named ordered;
-                                           ordered in ordered gets error */
+            stack_type == ct_ordered_in_pdo) &&
+           /* C doesn't allow named ordered; ordered in ordered gets error */
            p->stack_data[index].ident != NULL &&
            (p->stack_data[index].ident->flags & KMP_IDENT_KMPC))) {
         /* we are in ORDERED which is inside an ORDERED or CRITICAL construct */
@@ -399,9 +387,8 @@
 
   if (tos != p->w_top ||
       (p->stack_data[tos].type != ct &&
-       // below are two exceptions to the rule that construct types must match
-       !(p->stack_data[tos].type == ct_pdo_ordered && ct == ct_pdo) &&
-       !(p->stack_data[tos].type == ct_task_ordered && ct == ct_task))) {
+       // below is the exception to the rule that construct types must match
+       !(p->stack_data[tos].type == ct_pdo_ordered && ct == ct_pdo))) {
     __kmp_check_null_func();
     __kmp_error_construct2(kmp_i18n_msg_CnsExpectedEnd, ct, ident,
                            &p->stack_data[tos]);
Index: openmp/trunk/runtime/src/kmp_runtime.cpp
===================================================================
--- openmp/trunk/runtime/src/kmp_runtime.cpp
+++ openmp/trunk/runtime/src/kmp_runtime.cpp
@@ -532,10 +532,6 @@
                                &team->t.t_disp_buffer[num_disp_buff],
                                sizeof(dispatch_shared_info_t) * num_disp_buff,
                                "%s_%d.t_disp_buffer", header, team_id);
-
-  __kmp_print_storage_map_gtid(-1, &team->t.t_taskq, &team->t.t_copypriv_data,
-                               sizeof(kmp_taskq_t), "%s_%d.t_taskq", header,
-                               team_id);
 }
 
 static void __kmp_init_allocator() {
@@ -4524,8 +4520,6 @@
   team->t.t_ordered.dt.t_value = 0;
   team->t.t_master_active = FALSE;
 
-  memset(&team->t.t_taskq, '\0', sizeof(kmp_taskq_t));
-
 #ifdef KMP_DEBUG
   team->t.t_copypriv_data = NULL; /* not necessary, but nice for debugging */
 #endif
Index: openmp/trunk/runtime/src/kmp_taskq.cpp
===================================================================
--- openmp/trunk/runtime/src/kmp_taskq.cpp
+++ openmp/trunk/runtime/src/kmp_taskq.cpp
@@ -1,2022 +0,0 @@
-/*
- * kmp_taskq.cpp -- TASKQ support for OpenMP.
- */
-
-//===----------------------------------------------------------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-#include "kmp.h"
-#include "kmp_error.h"
-#include "kmp_i18n.h"
-#include "kmp_io.h"
-
-#define MAX_MESSAGE 512
-
-/* Taskq routines and global variables */
-
-#define KMP_DEBUG_REF_CTS(x) KF_TRACE(1, x);
-
-#define THREAD_ALLOC_FOR_TASKQ
-
-static int in_parallel_context(kmp_team_t *team) {
-  return !team->t.t_serialized;
-}
-
-static void __kmp_taskq_eo(int *gtid_ref, int *cid_ref, ident_t *loc_ref) {
-  int gtid = *gtid_ref;
-  int tid = __kmp_tid_from_gtid(gtid);
-  kmp_uint32 my_token;
-  kmpc_task_queue_t *taskq;
-  kmp_taskq_t *tq = &__kmp_threads[gtid]->th.th_team->t.t_taskq;
-
-  if (__kmp_env_consistency_check)
-#if KMP_USE_DYNAMIC_LOCK
-    __kmp_push_sync(gtid, ct_ordered_in_taskq, loc_ref, NULL, 0);
-#else
-    __kmp_push_sync(gtid, ct_ordered_in_taskq, loc_ref, NULL);
-#endif
-
-  if (!__kmp_threads[gtid]->th.th_team->t.t_serialized) {
-    KMP_MB(); /* Flush all pending memory write invalidates. */
-
-    /* GEH - need check here under stats to make sure */
-    /* inside task (curr_thunk[*tid_ref] != NULL) */
-
-    my_token = tq->tq_curr_thunk[tid]->th_tasknum;
-
-    taskq = tq->tq_curr_thunk[tid]->th.th_shareds->sv_queue;
-
-    KMP_WAIT(&taskq->tq_tasknum_serving, my_token, KMP_EQ, NULL);
-    KMP_MB();
-  }
-}
-
-static void __kmp_taskq_xo(int *gtid_ref, int *cid_ref, ident_t *loc_ref) {
-  int gtid = *gtid_ref;
-  int tid = __kmp_tid_from_gtid(gtid);
-  kmp_uint32 my_token;
-  kmp_taskq_t *tq = &__kmp_threads[gtid]->th.th_team->t.t_taskq;
-
-  if (__kmp_env_consistency_check)
-    __kmp_pop_sync(gtid, ct_ordered_in_taskq, loc_ref);
-
-  if (!__kmp_threads[gtid]->th.th_team->t.t_serialized) {
-    KMP_MB(); /* Flush all pending memory write invalidates. */
-
-    /* GEH - need check here under stats to make sure */
-    /* inside task (curr_thunk[tid] != NULL) */
-
-    my_token = tq->tq_curr_thunk[tid]->th_tasknum;
-
-    KMP_MB(); /* Flush all pending memory write invalidates. */
-
-    tq->tq_curr_thunk[tid]->th.th_shareds->sv_queue->tq_tasknum_serving =
-        my_token + 1;
-
-    KMP_MB(); /* Flush all pending memory write invalidates. */
-  }
-}
-
-static void __kmp_taskq_check_ordered(kmp_int32 gtid, kmpc_thunk_t *thunk) {
-  kmp_uint32 my_token;
-  kmpc_task_queue_t *taskq;
-
-  /* assume we are always called from an active parallel context */
-
-  KMP_MB(); /* Flush all pending memory write invalidates.
*/ - - my_token = thunk->th_tasknum; - - taskq = thunk->th.th_shareds->sv_queue; - - if (taskq->tq_tasknum_serving <= my_token) { - KMP_WAIT(&taskq->tq_tasknum_serving, my_token, KMP_GE, NULL); - KMP_MB(); - taskq->tq_tasknum_serving = my_token + 1; - KMP_MB(); - } -} - -#ifdef KMP_DEBUG - -static void __kmp_dump_TQF(kmp_int32 flags) { - if (flags & TQF_IS_ORDERED) - __kmp_printf("ORDERED "); - if (flags & TQF_IS_LASTPRIVATE) - __kmp_printf("LAST_PRIV "); - if (flags & TQF_IS_NOWAIT) - __kmp_printf("NOWAIT "); - if (flags & TQF_HEURISTICS) - __kmp_printf("HEURIST "); - if (flags & TQF_INTERFACE_RESERVED1) - __kmp_printf("RESERV1 "); - if (flags & TQF_INTERFACE_RESERVED2) - __kmp_printf("RESERV2 "); - if (flags & TQF_INTERFACE_RESERVED3) - __kmp_printf("RESERV3 "); - if (flags & TQF_INTERFACE_RESERVED4) - __kmp_printf("RESERV4 "); - if (flags & TQF_IS_LAST_TASK) - __kmp_printf("LAST_TASK "); - if (flags & TQF_TASKQ_TASK) - __kmp_printf("TASKQ_TASK "); - if (flags & TQF_RELEASE_WORKERS) - __kmp_printf("RELEASE "); - if (flags & TQF_ALL_TASKS_QUEUED) - __kmp_printf("ALL_QUEUED "); - if (flags & TQF_PARALLEL_CONTEXT) - __kmp_printf("PARALLEL "); - if (flags & TQF_DEALLOCATED) - __kmp_printf("DEALLOC "); - if (!(flags & (TQF_INTERNAL_FLAGS | TQF_INTERFACE_FLAGS))) - __kmp_printf("(NONE)"); -} - -static void __kmp_dump_thunk(kmp_taskq_t *tq, kmpc_thunk_t *thunk, - kmp_int32 global_tid) { - int i; - int nproc = __kmp_threads[global_tid]->th.th_team->t.t_nproc; - - __kmp_printf("\tThunk at %p on (%d): ", thunk, global_tid); - - if (thunk != NULL) { - for (i = 0; i < nproc; i++) { - if (tq->tq_curr_thunk[i] == thunk) { - __kmp_printf("[%i] ", i); - } - } - __kmp_printf("th_shareds=%p, ", thunk->th.th_shareds); - __kmp_printf("th_task=%p, ", thunk->th_task); - __kmp_printf("th_encl_thunk=%p, ", thunk->th_encl_thunk); - __kmp_printf("th_status=%d, ", thunk->th_status); - __kmp_printf("th_tasknum=%u, ", thunk->th_tasknum); - __kmp_printf("th_flags="); - __kmp_dump_TQF(thunk->th_flags); - } - - __kmp_printf("\n"); -} - -static void __kmp_dump_thunk_stack(kmpc_thunk_t *thunk, kmp_int32 thread_num) { - kmpc_thunk_t *th; - - __kmp_printf(" Thunk stack for T#%d: ", thread_num); - - for (th = thunk; th != NULL; th = th->th_encl_thunk) - __kmp_printf("%p ", th); - - __kmp_printf("\n"); -} - -static void __kmp_dump_task_queue(kmp_taskq_t *tq, kmpc_task_queue_t *queue, - kmp_int32 global_tid) { - int qs, count, i; - kmpc_thunk_t *thunk; - kmpc_task_queue_t *taskq; - - __kmp_printf("Task Queue at %p on (%d):\n", queue, global_tid); - - if (queue != NULL) { - int in_parallel = queue->tq_flags & TQF_PARALLEL_CONTEXT; - - if (__kmp_env_consistency_check) { - __kmp_printf(" tq_loc : "); - } - if (in_parallel) { - - // if (queue->tq.tq_parent != 0) - //__kmp_acquire_lock(& queue->tq.tq_parent->tq_link_lck, global_tid); - - //__kmp_acquire_lock(& queue->tq_link_lck, global_tid); - - // Make sure data structures are in consistent state before querying them - // Seems to work without this for digital/alpha, needed for IBM/RS6000 - KMP_MB(); - - __kmp_printf(" tq_parent : %p\n", queue->tq.tq_parent); - __kmp_printf(" tq_first_child : %p\n", queue->tq_first_child); - __kmp_printf(" tq_next_child : %p\n", queue->tq_next_child); - __kmp_printf(" tq_prev_child : %p\n", queue->tq_prev_child); - __kmp_printf(" tq_ref_count : %d\n", queue->tq_ref_count); - - //__kmp_release_lock(& queue->tq_link_lck, global_tid); - - // if (queue->tq.tq_parent != 0) - //__kmp_release_lock(& queue->tq.tq_parent->tq_link_lck, global_tid); - - 
//__kmp_acquire_lock(& queue->tq_free_thunks_lck, global_tid); - //__kmp_acquire_lock(& queue->tq_queue_lck, global_tid); - - // Make sure data structures are in consistent state before querying them - // Seems to work without this for digital/alpha, needed for IBM/RS6000 - KMP_MB(); - } - - __kmp_printf(" tq_shareds : "); - for (i = 0; i < ((queue == tq->tq_root) ? queue->tq_nproc : 1); i++) - __kmp_printf("%p ", queue->tq_shareds[i].ai_data); - __kmp_printf("\n"); - - if (in_parallel) { - __kmp_printf(" tq_tasknum_queuing : %u\n", queue->tq_tasknum_queuing); - __kmp_printf(" tq_tasknum_serving : %u\n", queue->tq_tasknum_serving); - } - - __kmp_printf(" tq_queue : %p\n", queue->tq_queue); - __kmp_printf(" tq_thunk_space : %p\n", queue->tq_thunk_space); - __kmp_printf(" tq_taskq_slot : %p\n", queue->tq_taskq_slot); - - __kmp_printf(" tq_free_thunks : "); - for (thunk = queue->tq_free_thunks; thunk != NULL; - thunk = thunk->th.th_next_free) - __kmp_printf("%p ", thunk); - __kmp_printf("\n"); - - __kmp_printf(" tq_nslots : %d\n", queue->tq_nslots); - __kmp_printf(" tq_head : %d\n", queue->tq_head); - __kmp_printf(" tq_tail : %d\n", queue->tq_tail); - __kmp_printf(" tq_nfull : %d\n", queue->tq_nfull); - __kmp_printf(" tq_hiwat : %d\n", queue->tq_hiwat); - __kmp_printf(" tq_flags : "); - __kmp_dump_TQF(queue->tq_flags); - __kmp_printf("\n"); - - if (in_parallel) { - __kmp_printf(" tq_th_thunks : "); - for (i = 0; i < queue->tq_nproc; i++) { - __kmp_printf("%d ", queue->tq_th_thunks[i].ai_data); - } - __kmp_printf("\n"); - } - - __kmp_printf("\n"); - __kmp_printf(" Queue slots:\n"); - - qs = queue->tq_tail; - for (count = 0; count < queue->tq_nfull; ++count) { - __kmp_printf("(%d)", qs); - __kmp_dump_thunk(tq, queue->tq_queue[qs].qs_thunk, global_tid); - qs = (qs + 1) % queue->tq_nslots; - } - - __kmp_printf("\n"); - - if (in_parallel) { - if (queue->tq_taskq_slot != NULL) { - __kmp_printf(" TaskQ slot:\n"); - __kmp_dump_thunk(tq, CCAST(kmpc_thunk_t *, queue->tq_taskq_slot), - global_tid); - __kmp_printf("\n"); - } - //__kmp_release_lock(& queue->tq_queue_lck, global_tid); - //__kmp_release_lock(& queue->tq_free_thunks_lck, global_tid); - } - } - - __kmp_printf(" Taskq freelist: "); - - //__kmp_acquire_lock( & tq->tq_freelist_lck, global_tid ); - - // Make sure data structures are in consistent state before querying them - // Seems to work without this call for digital/alpha, needed for IBM/RS6000 - KMP_MB(); - - for (taskq = tq->tq_freelist; taskq != NULL; taskq = taskq->tq.tq_next_free) - __kmp_printf("%p ", taskq); - - //__kmp_release_lock( & tq->tq_freelist_lck, global_tid ); - - __kmp_printf("\n\n"); -} - -static void __kmp_aux_dump_task_queue_tree(kmp_taskq_t *tq, - kmpc_task_queue_t *curr_queue, - kmp_int32 level, - kmp_int32 global_tid) { - int i, count, qs; - int nproc = __kmp_threads[global_tid]->th.th_team->t.t_nproc; - kmpc_task_queue_t *queue = curr_queue; - - if (curr_queue == NULL) - return; - - __kmp_printf(" "); - - for (i = 0; i < level; i++) - __kmp_printf(" "); - - __kmp_printf("%p", curr_queue); - - for (i = 0; i < nproc; i++) { - if (tq->tq_curr_thunk[i] && - tq->tq_curr_thunk[i]->th.th_shareds->sv_queue == curr_queue) { - __kmp_printf(" [%i]", i); - } - } - - __kmp_printf(":"); - - //__kmp_acquire_lock(& curr_queue->tq_queue_lck, global_tid); - - // Make sure data structures are in consistent state before querying them - // Seems to work without this call for digital/alpha, needed for IBM/RS6000 - KMP_MB(); - - qs = curr_queue->tq_tail; - - for (count = 0; count < 
curr_queue->tq_nfull; ++count) { - __kmp_printf("%p ", curr_queue->tq_queue[qs].qs_thunk); - qs = (qs + 1) % curr_queue->tq_nslots; - } - - //__kmp_release_lock(& curr_queue->tq_queue_lck, global_tid); - - __kmp_printf("\n"); - - if (curr_queue->tq_first_child) { - //__kmp_acquire_lock(& curr_queue->tq_link_lck, global_tid); - - // Make sure data structures are in consistent state before querying them - // Seems to work without this call for digital/alpha, needed for IBM/RS6000 - KMP_MB(); - - if (curr_queue->tq_first_child) { - for (queue = CCAST(kmpc_task_queue_t *, curr_queue->tq_first_child); - queue != NULL; queue = queue->tq_next_child) { - __kmp_aux_dump_task_queue_tree(tq, queue, level + 1, global_tid); - } - } - - //__kmp_release_lock(& curr_queue->tq_link_lck, global_tid); - } -} - -static void __kmp_dump_task_queue_tree(kmp_taskq_t *tq, - kmpc_task_queue_t *tqroot, - kmp_int32 global_tid) { - __kmp_printf("TaskQ Tree at root %p on (%d):\n", tqroot, global_tid); - - __kmp_aux_dump_task_queue_tree(tq, tqroot, 0, global_tid); - - __kmp_printf("\n"); -} -#endif - -/* New taskq storage routines that try to minimize overhead of mallocs but - still provide cache line alignment. */ -static void *__kmp_taskq_allocate(size_t size, kmp_int32 global_tid) { - void *addr, *orig_addr; - size_t bytes; - - KB_TRACE(5, ("__kmp_taskq_allocate: called size=%d, gtid=%d\n", (int)size, - global_tid)); - - bytes = sizeof(void *) + CACHE_LINE + size; - -#ifdef THREAD_ALLOC_FOR_TASKQ - orig_addr = - (void *)__kmp_thread_malloc(__kmp_thread_from_gtid(global_tid), bytes); -#else - KE_TRACE(10, ("%%%%%% MALLOC( %d )\n", bytes)); - orig_addr = (void *)KMP_INTERNAL_MALLOC(bytes); -#endif /* THREAD_ALLOC_FOR_TASKQ */ - - if (orig_addr == 0) - KMP_FATAL(OutOfHeapMemory); - - addr = orig_addr; - - if (((kmp_uintptr_t)addr & (CACHE_LINE - 1)) != 0) { - KB_TRACE(50, ("__kmp_taskq_allocate: adjust for cache alignment\n")); - addr = (void *)(((kmp_uintptr_t)addr + CACHE_LINE) & ~(CACHE_LINE - 1)); - } - - (*(void **)addr) = orig_addr; - - KB_TRACE(10, - ("__kmp_taskq_allocate: allocate: %p, use: %p - %p, size: %d, " - "gtid: %d\n", - orig_addr, ((void **)addr) + 1, - ((char *)(((void **)addr) + 1)) + size - 1, (int)size, global_tid)); - - return (((void **)addr) + 1); -} - -static void __kmpc_taskq_free(void *p, kmp_int32 global_tid) { - KB_TRACE(5, ("__kmpc_taskq_free: called addr=%p, gtid=%d\n", p, global_tid)); - - KB_TRACE(10, ("__kmpc_taskq_free: freeing: %p, gtid: %d\n", - (*(((void **)p) - 1)), global_tid)); - -#ifdef THREAD_ALLOC_FOR_TASKQ - __kmp_thread_free(__kmp_thread_from_gtid(global_tid), *(((void **)p) - 1)); -#else - KMP_INTERNAL_FREE(*(((void **)p) - 1)); -#endif /* THREAD_ALLOC_FOR_TASKQ */ -} - -/* Keep freed kmpc_task_queue_t on an internal freelist and recycle since - they're of constant size. 
*/ - -static kmpc_task_queue_t * -__kmp_alloc_taskq(kmp_taskq_t *tq, int in_parallel, kmp_int32 nslots, - kmp_int32 nthunks, kmp_int32 nshareds, kmp_int32 nproc, - size_t sizeof_thunk, size_t sizeof_shareds, - kmpc_thunk_t **new_taskq_thunk, kmp_int32 global_tid) { - kmp_int32 i; - size_t bytes; - kmpc_task_queue_t *new_queue; - kmpc_aligned_shared_vars_t *shared_var_array; - char *shared_var_storage; - char *pt; /* for doing byte-adjusted address computations */ - - __kmp_acquire_lock(&tq->tq_freelist_lck, global_tid); - - // Make sure data structures are in consistent state before querying them - // Seems to work without this call for digital/alpha, needed for IBM/RS6000 - KMP_MB(); - - if (tq->tq_freelist) { - new_queue = tq->tq_freelist; - tq->tq_freelist = tq->tq_freelist->tq.tq_next_free; - - KMP_DEBUG_ASSERT(new_queue->tq_flags & TQF_DEALLOCATED); - - new_queue->tq_flags = 0; - - __kmp_release_lock(&tq->tq_freelist_lck, global_tid); - } else { - __kmp_release_lock(&tq->tq_freelist_lck, global_tid); - - new_queue = (kmpc_task_queue_t *)__kmp_taskq_allocate( - sizeof(kmpc_task_queue_t), global_tid); - new_queue->tq_flags = 0; - } - - /* space in the task queue for queue slots (allocate as one big chunk */ - /* of storage including new_taskq_task space) */ - - sizeof_thunk += - (CACHE_LINE - (sizeof_thunk % CACHE_LINE)); /* pad to cache line size */ - pt = (char *)__kmp_taskq_allocate(nthunks * sizeof_thunk, global_tid); - new_queue->tq_thunk_space = (kmpc_thunk_t *)pt; - *new_taskq_thunk = (kmpc_thunk_t *)(pt + (nthunks - 1) * sizeof_thunk); - - /* chain the allocated thunks into a freelist for this queue */ - - new_queue->tq_free_thunks = (kmpc_thunk_t *)pt; - - for (i = 0; i < (nthunks - 2); i++) { - ((kmpc_thunk_t *)(pt + i * sizeof_thunk))->th.th_next_free = - (kmpc_thunk_t *)(pt + (i + 1) * sizeof_thunk); -#ifdef KMP_DEBUG - ((kmpc_thunk_t *)(pt + i * sizeof_thunk))->th_flags = TQF_DEALLOCATED; -#endif - } - - ((kmpc_thunk_t *)(pt + (nthunks - 2) * sizeof_thunk))->th.th_next_free = NULL; -#ifdef KMP_DEBUG - ((kmpc_thunk_t *)(pt + (nthunks - 2) * sizeof_thunk))->th_flags = - TQF_DEALLOCATED; -#endif - - /* initialize the locks */ - - if (in_parallel) { - __kmp_init_lock(&new_queue->tq_link_lck); - __kmp_init_lock(&new_queue->tq_free_thunks_lck); - __kmp_init_lock(&new_queue->tq_queue_lck); - } - - /* now allocate the slots */ - - bytes = nslots * sizeof(kmpc_aligned_queue_slot_t); - new_queue->tq_queue = - (kmpc_aligned_queue_slot_t *)__kmp_taskq_allocate(bytes, global_tid); - - /* space for array of pointers to shared variable structures */ - sizeof_shareds += sizeof(kmpc_task_queue_t *); - sizeof_shareds += - (CACHE_LINE - (sizeof_shareds % CACHE_LINE)); /* pad to cache line size */ - - bytes = nshareds * sizeof(kmpc_aligned_shared_vars_t); - shared_var_array = - (kmpc_aligned_shared_vars_t *)__kmp_taskq_allocate(bytes, global_tid); - - bytes = nshareds * sizeof_shareds; - shared_var_storage = (char *)__kmp_taskq_allocate(bytes, global_tid); - - for (i = 0; i < nshareds; i++) { - shared_var_array[i].ai_data = - (kmpc_shared_vars_t *)(shared_var_storage + i * sizeof_shareds); - shared_var_array[i].ai_data->sv_queue = new_queue; - } - new_queue->tq_shareds = shared_var_array; - - /* array for number of outstanding thunks per thread */ - - if (in_parallel) { - bytes = nproc * sizeof(kmpc_aligned_int32_t); - new_queue->tq_th_thunks = - (kmpc_aligned_int32_t *)__kmp_taskq_allocate(bytes, global_tid); - new_queue->tq_nproc = nproc; - - for (i = 0; i < nproc; i++) - 
new_queue->tq_th_thunks[i].ai_data = 0; - } - - return new_queue; -} - -static void __kmp_free_taskq(kmp_taskq_t *tq, kmpc_task_queue_t *p, - int in_parallel, kmp_int32 global_tid) { - __kmpc_taskq_free(p->tq_thunk_space, global_tid); - __kmpc_taskq_free(p->tq_queue, global_tid); - - /* free shared var structure storage */ - __kmpc_taskq_free(CCAST(kmpc_shared_vars_t *, p->tq_shareds[0].ai_data), - global_tid); - /* free array of pointers to shared vars storage */ - __kmpc_taskq_free(p->tq_shareds, global_tid); - -#ifdef KMP_DEBUG - p->tq_first_child = NULL; - p->tq_next_child = NULL; - p->tq_prev_child = NULL; - p->tq_ref_count = -10; - p->tq_shareds = NULL; - p->tq_tasknum_queuing = 0; - p->tq_tasknum_serving = 0; - p->tq_queue = NULL; - p->tq_thunk_space = NULL; - p->tq_taskq_slot = NULL; - p->tq_free_thunks = NULL; - p->tq_nslots = 0; - p->tq_head = 0; - p->tq_tail = 0; - p->tq_nfull = 0; - p->tq_hiwat = 0; - - if (in_parallel) { - int i; - - for (i = 0; i < p->tq_nproc; i++) - p->tq_th_thunks[i].ai_data = 0; - } - if (__kmp_env_consistency_check) - p->tq_loc = NULL; - KMP_DEBUG_ASSERT(p->tq_flags & TQF_DEALLOCATED); - p->tq_flags = TQF_DEALLOCATED; -#endif /* KMP_DEBUG */ - - if (in_parallel) { - __kmpc_taskq_free(p->tq_th_thunks, global_tid); - __kmp_destroy_lock(&p->tq_link_lck); - __kmp_destroy_lock(&p->tq_queue_lck); - __kmp_destroy_lock(&p->tq_free_thunks_lck); - } -#ifdef KMP_DEBUG - p->tq_th_thunks = NULL; -#endif /* KMP_DEBUG */ - - // Make sure data structures are in consistent state before querying them - // Seems to work without this call for digital/alpha, needed for IBM/RS6000 - KMP_MB(); - - __kmp_acquire_lock(&tq->tq_freelist_lck, global_tid); - p->tq.tq_next_free = tq->tq_freelist; - - tq->tq_freelist = p; - __kmp_release_lock(&tq->tq_freelist_lck, global_tid); -} - -/* Once a group of thunks has been allocated for use in a particular queue, - these are managed via a per-queue freelist. - We force a check that there's always a thunk free if we need one. 
*/ - -static kmpc_thunk_t *__kmp_alloc_thunk(kmpc_task_queue_t *queue, - int in_parallel, kmp_int32 global_tid) { - kmpc_thunk_t *fl; - - if (in_parallel) { - __kmp_acquire_lock(&queue->tq_free_thunks_lck, global_tid); - // Make sure data structures are in consistent state before querying them - // Seems to work without this call for digital/alpha, needed for IBM/RS6000 - KMP_MB(); - } - - fl = queue->tq_free_thunks; - - KMP_DEBUG_ASSERT(fl != NULL); - - queue->tq_free_thunks = fl->th.th_next_free; - fl->th_flags = 0; - - if (in_parallel) - __kmp_release_lock(&queue->tq_free_thunks_lck, global_tid); - - return fl; -} - -static void __kmp_free_thunk(kmpc_task_queue_t *queue, kmpc_thunk_t *p, - int in_parallel, kmp_int32 global_tid) { -#ifdef KMP_DEBUG - p->th_task = 0; - p->th_encl_thunk = 0; - p->th_status = 0; - p->th_tasknum = 0; -/* Also could zero pointers to private vars */ -#endif - - if (in_parallel) { - __kmp_acquire_lock(&queue->tq_free_thunks_lck, global_tid); - // Make sure data structures are in consistent state before querying them - // Seems to work without this call for digital/alpha, needed for IBM/RS6000 - KMP_MB(); - } - - p->th.th_next_free = queue->tq_free_thunks; - queue->tq_free_thunks = p; - -#ifdef KMP_DEBUG - p->th_flags = TQF_DEALLOCATED; -#endif - - if (in_parallel) - __kmp_release_lock(&queue->tq_free_thunks_lck, global_tid); -} - -/* returns nonzero if the queue just became full after the enqueue */ -static kmp_int32 __kmp_enqueue_task(kmp_taskq_t *tq, kmp_int32 global_tid, - kmpc_task_queue_t *queue, - kmpc_thunk_t *thunk, int in_parallel) { - kmp_int32 ret; - - /* dkp: can we get around the lock in the TQF_RELEASE_WORKERS case (only the - * master is executing then) */ - if (in_parallel) { - __kmp_acquire_lock(&queue->tq_queue_lck, global_tid); - // Make sure data structures are in consistent state before querying them - // Seems to work without this call for digital/alpha, needed for IBM/RS6000 - KMP_MB(); - } - - KMP_DEBUG_ASSERT(queue->tq_nfull < queue->tq_nslots); // check queue not full - - queue->tq_queue[(queue->tq_head)++].qs_thunk = thunk; - - if (queue->tq_head >= queue->tq_nslots) - queue->tq_head = 0; - - (queue->tq_nfull)++; - - KMP_MB(); /* to assure that nfull is seen to increase before - TQF_ALL_TASKS_QUEUED is set */ - - ret = (in_parallel) ? (queue->tq_nfull == queue->tq_nslots) : FALSE; - - if (in_parallel) { - /* don't need to wait until workers are released before unlocking */ - __kmp_release_lock(&queue->tq_queue_lck, global_tid); - - if (tq->tq_global_flags & TQF_RELEASE_WORKERS) { - // If just creating the root queue, the worker threads are waiting at a - // join barrier until now, when there's something in the queue for them to - // do; release them now to do work. This should only be done when this is - // the first task enqueued, so reset the flag here also. 
- tq->tq_global_flags &= ~TQF_RELEASE_WORKERS; /* no lock needed, workers - are still in spin mode */ - // avoid releasing barrier twice if taskq_task switches threads - KMP_MB(); - - __kmpc_end_barrier_master(NULL, global_tid); - } - } - - return ret; -} - -static kmpc_thunk_t *__kmp_dequeue_task(kmp_int32 global_tid, - kmpc_task_queue_t *queue, - int in_parallel) { - kmpc_thunk_t *pt; - int tid = __kmp_tid_from_gtid(global_tid); - - KMP_DEBUG_ASSERT(queue->tq_nfull > 0); /* check queue not empty */ - - if (queue->tq.tq_parent != NULL && in_parallel) { - int ct; - __kmp_acquire_lock(&queue->tq.tq_parent->tq_link_lck, global_tid); - ct = ++(queue->tq_ref_count); - __kmp_release_lock(&queue->tq.tq_parent->tq_link_lck, global_tid); - KMP_DEBUG_REF_CTS( - ("line %d gtid %d: Q %p inc %d\n", __LINE__, global_tid, queue, ct)); - } - - pt = queue->tq_queue[(queue->tq_tail)++].qs_thunk; - - if (queue->tq_tail >= queue->tq_nslots) - queue->tq_tail = 0; - - if (in_parallel) { - queue->tq_th_thunks[tid].ai_data++; - - KMP_MB(); /* necessary so ai_data increment is propagated to other threads - immediately (digital) */ - - KF_TRACE(200, ("__kmp_dequeue_task: T#%d(:%d) now has %d outstanding " - "thunks from queue %p\n", - global_tid, tid, queue->tq_th_thunks[tid].ai_data, queue)); - } - - (queue->tq_nfull)--; - -#ifdef KMP_DEBUG - KMP_MB(); - - /* necessary so (queue->tq_nfull > 0) above succeeds after tq_nfull is - * decremented */ - - KMP_DEBUG_ASSERT(queue->tq_nfull >= 0); - - if (in_parallel) { - KMP_DEBUG_ASSERT(queue->tq_th_thunks[tid].ai_data <= - __KMP_TASKQ_THUNKS_PER_TH); - } -#endif - - return pt; -} - -/* Find the next (non-null) task to dequeue and return it. - * This is never called unless in_parallel=TRUE - * - * Here are the rules for deciding which queue to take the task from: - * 1. Walk up the task queue tree from the current queue's parent and look - * on the way up (for loop, below). - * 2. Do a depth-first search back down the tree from the root and - * look (find_task_in_descendant_queue()). - * - * Here are the rules for deciding which task to take from a queue - * (__kmp_find_task_in_queue ()): - * 1. Never take the last task from a queue if TQF_IS_LASTPRIVATE; this task - * must be staged to make sure we execute the last one with - * TQF_IS_LAST_TASK at the end of task queue execution. - * 2. If the queue length is below some high water mark and the taskq task - * is enqueued, prefer running the taskq task. - * 3. Otherwise, take a (normal) task from the queue. - * - * If we do all this and return pt == NULL at the bottom of this routine, - * this means there are no more tasks to execute (except possibly for - * TQF_IS_LASTPRIVATE). 
- */ - -static kmpc_thunk_t *__kmp_find_task_in_queue(kmp_int32 global_tid, - kmpc_task_queue_t *queue) { - kmpc_thunk_t *pt = NULL; - int tid = __kmp_tid_from_gtid(global_tid); - - /* To prevent deadlock from tq_queue_lck if queue already deallocated */ - if (!(queue->tq_flags & TQF_DEALLOCATED)) { - - __kmp_acquire_lock(&queue->tq_queue_lck, global_tid); - - /* Check again to avoid race in __kmpc_end_taskq() */ - if (!(queue->tq_flags & TQF_DEALLOCATED)) { - // Make sure data structures are in consistent state before querying them - // Seems to work without this for digital/alpha, needed for IBM/RS6000 - KMP_MB(); - - if ((queue->tq_taskq_slot != NULL) && - (queue->tq_nfull <= queue->tq_hiwat)) { - /* if there's enough room in the queue and the dispatcher */ - /* (taskq task) is available, schedule more tasks */ - pt = CCAST(kmpc_thunk_t *, queue->tq_taskq_slot); - queue->tq_taskq_slot = NULL; - } else if (queue->tq_nfull == 0 || - queue->tq_th_thunks[tid].ai_data >= - __KMP_TASKQ_THUNKS_PER_TH) { - /* do nothing if no thunks available or this thread can't */ - /* run any because it already is executing too many */ - pt = NULL; - } else if (queue->tq_nfull > 1) { - /* always safe to schedule a task even if TQF_IS_LASTPRIVATE */ - - pt = __kmp_dequeue_task(global_tid, queue, TRUE); - } else if (!(queue->tq_flags & TQF_IS_LASTPRIVATE)) { - // one thing in queue, always safe to schedule if !TQF_IS_LASTPRIVATE - pt = __kmp_dequeue_task(global_tid, queue, TRUE); - } else if (queue->tq_flags & TQF_IS_LAST_TASK) { - /* TQF_IS_LASTPRIVATE, one thing in queue, kmpc_end_taskq_task() */ - /* has been run so this is last task, run with TQF_IS_LAST_TASK so */ - /* instrumentation does copy-out. */ - pt = __kmp_dequeue_task(global_tid, queue, TRUE); - pt->th_flags |= - TQF_IS_LAST_TASK; /* don't need test_then_or since already locked */ - } - } - - /* GEH - What happens here if is lastprivate, but not last task? */ - __kmp_release_lock(&queue->tq_queue_lck, global_tid); - } - - return pt; -} - -/* Walk a tree of queues starting at queue's first child and return a non-NULL - thunk if one can be scheduled. 
Must only be called when in_parallel=TRUE */ - -static kmpc_thunk_t * -__kmp_find_task_in_descendant_queue(kmp_int32 global_tid, - kmpc_task_queue_t *curr_queue) { - kmpc_thunk_t *pt = NULL; - kmpc_task_queue_t *queue = curr_queue; - - if (curr_queue->tq_first_child != NULL) { - __kmp_acquire_lock(&curr_queue->tq_link_lck, global_tid); - // Make sure data structures are in consistent state before querying them - // Seems to work without this call for digital/alpha, needed for IBM/RS6000 - KMP_MB(); - - queue = CCAST(kmpc_task_queue_t *, curr_queue->tq_first_child); - if (queue == NULL) { - __kmp_release_lock(&curr_queue->tq_link_lck, global_tid); - return NULL; - } - - while (queue != NULL) { - int ct; - kmpc_task_queue_t *next; - - ct = ++(queue->tq_ref_count); - __kmp_release_lock(&curr_queue->tq_link_lck, global_tid); - KMP_DEBUG_REF_CTS( - ("line %d gtid %d: Q %p inc %d\n", __LINE__, global_tid, queue, ct)); - - pt = __kmp_find_task_in_queue(global_tid, queue); - - if (pt != NULL) { - int ct; - - __kmp_acquire_lock(&curr_queue->tq_link_lck, global_tid); - // Make sure data structures in consistent state before querying them - // Seems to work without this for digital/alpha, needed for IBM/RS6000 - KMP_MB(); - - ct = --(queue->tq_ref_count); - KMP_DEBUG_REF_CTS(("line %d gtid %d: Q %p dec %d\n", __LINE__, - global_tid, queue, ct)); - KMP_DEBUG_ASSERT(queue->tq_ref_count >= 0); - - __kmp_release_lock(&curr_queue->tq_link_lck, global_tid); - - return pt; - } - - /* although reference count stays active during descendant walk, shouldn't - matter since if children still exist, reference counts aren't being - monitored anyway */ - - pt = __kmp_find_task_in_descendant_queue(global_tid, queue); - - if (pt != NULL) { - int ct; - - __kmp_acquire_lock(&curr_queue->tq_link_lck, global_tid); - // Make sure data structures in consistent state before querying them - // Seems to work without this for digital/alpha, needed for IBM/RS6000 - KMP_MB(); - - ct = --(queue->tq_ref_count); - KMP_DEBUG_REF_CTS(("line %d gtid %d: Q %p dec %d\n", __LINE__, - global_tid, queue, ct)); - KMP_DEBUG_ASSERT(ct >= 0); - - __kmp_release_lock(&curr_queue->tq_link_lck, global_tid); - - return pt; - } - - __kmp_acquire_lock(&curr_queue->tq_link_lck, global_tid); - // Make sure data structures in consistent state before querying them - // Seems to work without this for digital/alpha, needed for IBM/RS6000 - KMP_MB(); - - next = queue->tq_next_child; - - ct = --(queue->tq_ref_count); - KMP_DEBUG_REF_CTS( - ("line %d gtid %d: Q %p dec %d\n", __LINE__, global_tid, queue, ct)); - KMP_DEBUG_ASSERT(ct >= 0); - - queue = next; - } - - __kmp_release_lock(&curr_queue->tq_link_lck, global_tid); - } - - return pt; -} - -/* Walk up the taskq tree looking for a task to execute. If we get to the root, - search the tree for a descendent queue task. 
Must only be called when - in_parallel=TRUE */ -static kmpc_thunk_t * -__kmp_find_task_in_ancestor_queue(kmp_taskq_t *tq, kmp_int32 global_tid, - kmpc_task_queue_t *curr_queue) { - kmpc_task_queue_t *queue; - kmpc_thunk_t *pt; - - pt = NULL; - - if (curr_queue->tq.tq_parent != NULL) { - queue = curr_queue->tq.tq_parent; - - while (queue != NULL) { - if (queue->tq.tq_parent != NULL) { - int ct; - __kmp_acquire_lock(&queue->tq.tq_parent->tq_link_lck, global_tid); - // Make sure data structures in consistent state before querying them - // Seems to work without this for digital/alpha, needed for IBM/RS6000 - KMP_MB(); - - ct = ++(queue->tq_ref_count); - __kmp_release_lock(&queue->tq.tq_parent->tq_link_lck, global_tid); - KMP_DEBUG_REF_CTS(("line %d gtid %d: Q %p inc %d\n", __LINE__, - global_tid, queue, ct)); - } - - pt = __kmp_find_task_in_queue(global_tid, queue); - if (pt != NULL) { - if (queue->tq.tq_parent != NULL) { - int ct; - __kmp_acquire_lock(&queue->tq.tq_parent->tq_link_lck, global_tid); - // Make sure data structures in consistent state before querying them - // Seems to work without this for digital/alpha, needed for IBM/RS6000 - KMP_MB(); - - ct = --(queue->tq_ref_count); - KMP_DEBUG_REF_CTS(("line %d gtid %d: Q %p dec %d\n", __LINE__, - global_tid, queue, ct)); - KMP_DEBUG_ASSERT(ct >= 0); - - __kmp_release_lock(&queue->tq.tq_parent->tq_link_lck, global_tid); - } - - return pt; - } - - if (queue->tq.tq_parent != NULL) { - int ct; - __kmp_acquire_lock(&queue->tq.tq_parent->tq_link_lck, global_tid); - // Make sure data structures in consistent state before querying them - // Seems to work without this for digital/alpha, needed for IBM/RS6000 - KMP_MB(); - - ct = --(queue->tq_ref_count); - KMP_DEBUG_REF_CTS(("line %d gtid %d: Q %p dec %d\n", __LINE__, - global_tid, queue, ct)); - KMP_DEBUG_ASSERT(ct >= 0); - } - queue = queue->tq.tq_parent; - - if (queue != NULL) - __kmp_release_lock(&queue->tq_link_lck, global_tid); - } - } - - pt = __kmp_find_task_in_descendant_queue(global_tid, tq->tq_root); - - return pt; -} - -static int __kmp_taskq_tasks_finished(kmpc_task_queue_t *queue) { - int i; - - /* KMP_MB(); */ /* is this really necessary? 
*/ - - for (i = 0; i < queue->tq_nproc; i++) { - if (queue->tq_th_thunks[i].ai_data != 0) - return FALSE; - } - - return TRUE; -} - -static int __kmp_taskq_has_any_children(kmpc_task_queue_t *queue) { - return (queue->tq_first_child != NULL); -} - -static void __kmp_remove_queue_from_tree(kmp_taskq_t *tq, kmp_int32 global_tid, - kmpc_task_queue_t *queue, - int in_parallel) { -#ifdef KMP_DEBUG - kmp_int32 i; - kmpc_thunk_t *thunk; -#endif - - KF_TRACE(50, - ("Before Deletion of TaskQ at %p on (%d):\n", queue, global_tid)); - KF_DUMP(50, __kmp_dump_task_queue(tq, queue, global_tid)); - - /* sub-queue in a recursion, not the root task queue */ - KMP_DEBUG_ASSERT(queue->tq.tq_parent != NULL); - - if (in_parallel) { - __kmp_acquire_lock(&queue->tq.tq_parent->tq_link_lck, global_tid); - // Make sure data structures are in consistent state before querying them - // Seems to work without this call for digital/alpha, needed for IBM/RS6000 - KMP_MB(); - } - - KMP_DEBUG_ASSERT(queue->tq_first_child == NULL); - - /* unlink queue from its siblings if any at this level */ - if (queue->tq_prev_child != NULL) - queue->tq_prev_child->tq_next_child = queue->tq_next_child; - if (queue->tq_next_child != NULL) - queue->tq_next_child->tq_prev_child = queue->tq_prev_child; - if (queue->tq.tq_parent->tq_first_child == queue) - queue->tq.tq_parent->tq_first_child = queue->tq_next_child; - - queue->tq_prev_child = NULL; - queue->tq_next_child = NULL; - - if (in_parallel) { - KMP_DEBUG_REF_CTS( - ("line %d gtid %d: Q %p waiting for ref_count of %d to reach 1\n", - __LINE__, global_tid, queue, queue->tq_ref_count)); - - /* wait until all other threads have stopped accessing this queue */ - while (queue->tq_ref_count > 1) { - __kmp_release_lock(&queue->tq.tq_parent->tq_link_lck, global_tid); - - KMP_WAIT((volatile kmp_uint32 *)&queue->tq_ref_count, 1, KMP_LE, NULL); - - __kmp_acquire_lock(&queue->tq.tq_parent->tq_link_lck, global_tid); - // Make sure data structures are in consistent state before querying them - // Seems to work without this for digital/alpha, needed for IBM/RS6000 - KMP_MB(); - } - - __kmp_release_lock(&queue->tq.tq_parent->tq_link_lck, global_tid); - } - - KMP_DEBUG_REF_CTS( - ("line %d gtid %d: Q %p freeing queue\n", __LINE__, global_tid, queue)); - -#ifdef KMP_DEBUG - KMP_DEBUG_ASSERT(queue->tq_flags & TQF_ALL_TASKS_QUEUED); - KMP_DEBUG_ASSERT(queue->tq_nfull == 0); - - for (i = 0; i < queue->tq_nproc; i++) { - KMP_DEBUG_ASSERT(queue->tq_th_thunks[i].ai_data == 0); - } - - i = 0; - for (thunk = queue->tq_free_thunks; thunk != NULL; - thunk = thunk->th.th_next_free) - ++i; - - KMP_ASSERT(i == - queue->tq_nslots + (queue->tq_nproc * __KMP_TASKQ_THUNKS_PER_TH)); -#endif - - /* release storage for queue entry */ - __kmp_free_taskq(tq, queue, TRUE, global_tid); - - KF_TRACE(50, ("After Deletion of TaskQ at %p on (%d):\n", queue, global_tid)); - KF_DUMP(50, __kmp_dump_task_queue_tree(tq, tq->tq_root, global_tid)); -} - -/* Starting from indicated queue, proceed downward through tree and remove all - taskqs which are finished, but only go down to taskqs which have the "nowait" - clause present. Assume this is only called when in_parallel=TRUE. 
*/ - -static void __kmp_find_and_remove_finished_child_taskq( - kmp_taskq_t *tq, kmp_int32 global_tid, kmpc_task_queue_t *curr_queue) { - kmpc_task_queue_t *queue = curr_queue; - - if (curr_queue->tq_first_child != NULL) { - __kmp_acquire_lock(&curr_queue->tq_link_lck, global_tid); - // Make sure data structures are in consistent state before querying them - // Seems to work without this call for digital/alpha, needed for IBM/RS6000 - KMP_MB(); - - queue = CCAST(kmpc_task_queue_t *, curr_queue->tq_first_child); - if (queue != NULL) { - __kmp_release_lock(&curr_queue->tq_link_lck, global_tid); - return; - } - - while (queue != NULL) { - kmpc_task_queue_t *next; - int ct = ++(queue->tq_ref_count); - KMP_DEBUG_REF_CTS( - ("line %d gtid %d: Q %p inc %d\n", __LINE__, global_tid, queue, ct)); - - /* although reference count stays active during descendant walk, */ - /* shouldn't matter since if children still exist, reference */ - /* counts aren't being monitored anyway */ - - if (queue->tq_flags & TQF_IS_NOWAIT) { - __kmp_find_and_remove_finished_child_taskq(tq, global_tid, queue); - - if ((queue->tq_flags & TQF_ALL_TASKS_QUEUED) && - (queue->tq_nfull == 0) && __kmp_taskq_tasks_finished(queue) && - !__kmp_taskq_has_any_children(queue)) { - - /* Only remove this if we have not already marked it for deallocation. - This should prevent multiple threads from trying to free this. */ - - if (__kmp_test_lock(&queue->tq_queue_lck, global_tid)) { - if (!(queue->tq_flags & TQF_DEALLOCATED)) { - queue->tq_flags |= TQF_DEALLOCATED; - __kmp_release_lock(&queue->tq_queue_lck, global_tid); - - __kmp_remove_queue_from_tree(tq, global_tid, queue, TRUE); - - /* Can't do any more here since can't be sure where sibling queue - * is so just exit this level */ - return; - } else { - __kmp_release_lock(&queue->tq_queue_lck, global_tid); - } - } - /* otherwise, just fall through and decrement reference count */ - } - } - - __kmp_acquire_lock(&curr_queue->tq_link_lck, global_tid); - // Make sure data structures are in consistent state before querying them - // Seems to work without this for digital/alpha, needed for IBM/RS6000 - KMP_MB(); - - next = queue->tq_next_child; - - ct = --(queue->tq_ref_count); - KMP_DEBUG_REF_CTS( - ("line %d gtid %d: Q %p dec %d\n", __LINE__, global_tid, queue, ct)); - KMP_DEBUG_ASSERT(ct >= 0); - - queue = next; - } - - __kmp_release_lock(&curr_queue->tq_link_lck, global_tid); - } -} - -/* Starting from indicated queue, proceed downward through tree and remove all - taskq's assuming all are finished and assuming NO other threads are executing - at this point. 
*/ -static void __kmp_remove_all_child_taskq(kmp_taskq_t *tq, kmp_int32 global_tid, - kmpc_task_queue_t *queue) { - kmpc_task_queue_t *next_child; - - queue = CCAST(kmpc_task_queue_t *, queue->tq_first_child); - - while (queue != NULL) { - __kmp_remove_all_child_taskq(tq, global_tid, queue); - - next_child = queue->tq_next_child; - queue->tq_flags |= TQF_DEALLOCATED; - __kmp_remove_queue_from_tree(tq, global_tid, queue, FALSE); - queue = next_child; - } -} - -static void __kmp_execute_task_from_queue(kmp_taskq_t *tq, ident_t *loc, - kmp_int32 global_tid, - kmpc_thunk_t *thunk, - int in_parallel) { - kmpc_task_queue_t *queue = thunk->th.th_shareds->sv_queue; - kmp_int32 tid = __kmp_tid_from_gtid(global_tid); - - KF_TRACE(100, ("After dequeueing this Task on (%d):\n", global_tid)); - KF_DUMP(100, __kmp_dump_thunk(tq, thunk, global_tid)); - KF_TRACE(100, ("Task Queue: %p looks like this (%d):\n", queue, global_tid)); - KF_DUMP(100, __kmp_dump_task_queue(tq, queue, global_tid)); - - /* For the taskq task, the curr_thunk pushes and pop pairs are set up as - * follows: - * - * happens exactly once: - * 1) __kmpc_taskq : push (if returning thunk only) - * 4) __kmpc_end_taskq_task : pop - * - * optionally happens *each* time taskq task is dequeued/enqueued: - * 2) __kmpc_taskq_task : pop - * 3) __kmp_execute_task_from_queue : push - * - * execution ordering: 1,(2,3)*,4 - */ - - if (!(thunk->th_flags & TQF_TASKQ_TASK)) { - kmp_int32 index = (queue == tq->tq_root) ? tid : 0; - thunk->th.th_shareds = - CCAST(kmpc_shared_vars_t *, queue->tq_shareds[index].ai_data); - - if (__kmp_env_consistency_check) { - __kmp_push_workshare(global_tid, - (queue->tq_flags & TQF_IS_ORDERED) ? ct_task_ordered - : ct_task, - queue->tq_loc); - } - } else { - if (__kmp_env_consistency_check) - __kmp_push_workshare(global_tid, ct_taskq, queue->tq_loc); - } - - if (in_parallel) { - thunk->th_encl_thunk = tq->tq_curr_thunk[tid]; - tq->tq_curr_thunk[tid] = thunk; - - KF_DUMP(200, __kmp_dump_thunk_stack(tq->tq_curr_thunk[tid], global_tid)); - } - - KF_TRACE(50, ("Begin Executing Thunk %p from queue %p on (%d)\n", thunk, - queue, global_tid)); - thunk->th_task(global_tid, thunk); - KF_TRACE(50, ("End Executing Thunk %p from queue %p on (%d)\n", thunk, queue, - global_tid)); - - if (!(thunk->th_flags & TQF_TASKQ_TASK)) { - if (__kmp_env_consistency_check) - __kmp_pop_workshare(global_tid, - (queue->tq_flags & TQF_IS_ORDERED) ? ct_task_ordered - : ct_task, - queue->tq_loc); - - if (in_parallel) { - tq->tq_curr_thunk[tid] = thunk->th_encl_thunk; - thunk->th_encl_thunk = NULL; - KF_DUMP(200, __kmp_dump_thunk_stack(tq->tq_curr_thunk[tid], global_tid)); - } - - if ((thunk->th_flags & TQF_IS_ORDERED) && in_parallel) { - __kmp_taskq_check_ordered(global_tid, thunk); - } - - __kmp_free_thunk(queue, thunk, in_parallel, global_tid); - - KF_TRACE(100, ("T#%d After freeing thunk: %p, TaskQ looks like this:\n", - global_tid, thunk)); - KF_DUMP(100, __kmp_dump_task_queue(tq, queue, global_tid)); - - if (in_parallel) { - KMP_MB(); /* needed so thunk put on free list before outstanding thunk - count is decremented */ - - KMP_DEBUG_ASSERT(queue->tq_th_thunks[tid].ai_data >= 1); - - KF_TRACE( - 200, - ("__kmp_execute_task_from_queue: T#%d has %d thunks in queue %p\n", - global_tid, queue->tq_th_thunks[tid].ai_data - 1, queue)); - - queue->tq_th_thunks[tid].ai_data--; - - /* KMP_MB(); */ /* is MB really necessary ? 
*/ - } - - if (queue->tq.tq_parent != NULL && in_parallel) { - int ct; - __kmp_acquire_lock(&queue->tq.tq_parent->tq_link_lck, global_tid); - ct = --(queue->tq_ref_count); - __kmp_release_lock(&queue->tq.tq_parent->tq_link_lck, global_tid); - KMP_DEBUG_REF_CTS( - ("line %d gtid %d: Q %p dec %d\n", __LINE__, global_tid, queue, ct)); - KMP_DEBUG_ASSERT(ct >= 0); - } - } -} - -/* starts a taskq; creates and returns a thunk for the taskq_task */ -/* also, returns pointer to shared vars for this thread in "shareds" arg */ -kmpc_thunk_t *__kmpc_taskq(ident_t *loc, kmp_int32 global_tid, - kmpc_task_t taskq_task, size_t sizeof_thunk, - size_t sizeof_shareds, kmp_int32 flags, - kmpc_shared_vars_t **shareds) { - int in_parallel; - kmp_int32 nslots, nthunks, nshareds, nproc; - kmpc_task_queue_t *new_queue, *curr_queue; - kmpc_thunk_t *new_taskq_thunk; - kmp_info_t *th; - kmp_team_t *team; - kmp_taskq_t *tq; - kmp_int32 tid; - - KE_TRACE(10, ("__kmpc_taskq called (%d)\n", global_tid)); - - th = __kmp_threads[global_tid]; - team = th->th.th_team; - tq = &team->t.t_taskq; - nproc = team->t.t_nproc; - tid = __kmp_tid_from_gtid(global_tid); - - /* find out whether this is a parallel taskq or serialized one. */ - in_parallel = in_parallel_context(team); - - if (!tq->tq_root) { - if (in_parallel) { - /* Vector ORDERED SECTION to taskq version */ - th->th.th_dispatch->th_deo_fcn = __kmp_taskq_eo; - - /* Vector ORDERED SECTION to taskq version */ - th->th.th_dispatch->th_dxo_fcn = __kmp_taskq_xo; - } - - if (in_parallel) { - // This shouldn't be a barrier region boundary, it will confuse the user. - /* Need the boundary to be at the end taskq instead. */ - if (__kmp_barrier(bs_plain_barrier, global_tid, TRUE, 0, NULL, NULL)) { - /* Creating the active root queue, and we are not the master thread. */ - /* The master thread below created the queue and tasks have been */ - /* enqueued, and the master thread released this barrier. This */ - /* worker thread can now proceed and execute tasks. See also the */ - /* TQF_RELEASE_WORKERS which is used to handle this case. */ - *shareds = - CCAST(kmpc_shared_vars_t *, tq->tq_root->tq_shareds[tid].ai_data); - KE_TRACE(10, ("__kmpc_taskq return (%d)\n", global_tid)); - - return NULL; - } - } - - /* master thread only executes this code */ - if (tq->tq_curr_thunk_capacity < nproc) { - if (tq->tq_curr_thunk) - __kmp_free(tq->tq_curr_thunk); - else { - /* only need to do this once at outer level, i.e. when tq_curr_thunk is - * still NULL */ - __kmp_init_lock(&tq->tq_freelist_lck); - } - - tq->tq_curr_thunk = - (kmpc_thunk_t **)__kmp_allocate(nproc * sizeof(kmpc_thunk_t *)); - tq->tq_curr_thunk_capacity = nproc; - } - - if (in_parallel) - tq->tq_global_flags = TQF_RELEASE_WORKERS; - } - - /* dkp: in future, if flags & TQF_HEURISTICS, will choose nslots based */ - /* on some heuristics (e.g., depth of queue nesting?). */ - nslots = (in_parallel) ? (2 * nproc) : 1; - - /* There must be nproc * __KMP_TASKQ_THUNKS_PER_TH extra slots for pending */ - /* jobs being executed by other threads, and one extra for taskq slot */ - nthunks = (in_parallel) ? (nslots + (nproc * __KMP_TASKQ_THUNKS_PER_TH) + 1) - : nslots + 2; - - /* Only the root taskq gets a per-thread array of shareds. */ - /* The rest of the taskq's only get one copy of the shared vars. */ - nshareds = (!tq->tq_root && in_parallel) ? 
-  /* create the overall queue data structure and its components that require
-   * allocation */
-  new_queue = __kmp_alloc_taskq(tq, in_parallel, nslots, nthunks, nshareds,
-                                nproc, sizeof_thunk, sizeof_shareds,
-                                &new_taskq_thunk, global_tid);
-
-  /* rest of new_queue initializations */
-  new_queue->tq_flags = flags & TQF_INTERFACE_FLAGS;
-
-  if (in_parallel) {
-    new_queue->tq_tasknum_queuing = 0;
-    new_queue->tq_tasknum_serving = 0;
-    new_queue->tq_flags |= TQF_PARALLEL_CONTEXT;
-  }
-
-  new_queue->tq_taskq_slot = NULL;
-  new_queue->tq_nslots = nslots;
-  new_queue->tq_hiwat = HIGH_WATER_MARK(nslots);
-  new_queue->tq_nfull = 0;
-  new_queue->tq_head = 0;
-  new_queue->tq_tail = 0;
-  new_queue->tq_loc = loc;
-
-  if ((new_queue->tq_flags & TQF_IS_ORDERED) && in_parallel) {
-    /* prepare to serve the first-queued task's ORDERED directive */
-    new_queue->tq_tasknum_serving = 1;
-
-    /* Vector ORDERED SECTION to taskq version */
-    th->th.th_dispatch->th_deo_fcn = __kmp_taskq_eo;
-
-    /* Vector ORDERED SECTION to taskq version */
-    th->th.th_dispatch->th_dxo_fcn = __kmp_taskq_xo;
-  }
-
-  /* create a new thunk for the taskq_task in the new_queue */
-  *shareds = CCAST(kmpc_shared_vars_t *, new_queue->tq_shareds[0].ai_data);
-
-  new_taskq_thunk->th.th_shareds = *shareds;
-  new_taskq_thunk->th_task = taskq_task;
-  new_taskq_thunk->th_flags = new_queue->tq_flags | TQF_TASKQ_TASK;
-  new_taskq_thunk->th_status = 0;
-
-  KMP_DEBUG_ASSERT(new_taskq_thunk->th_flags & TQF_TASKQ_TASK);
-
-  // Make sure these inits complete before threads start using this queue
-  /* KMP_MB(); */ // (necessary?)
-
-  /* insert the new task queue into the tree, but only after all fields are
-   * initialized */
-
-  if (in_parallel) {
-    if (!tq->tq_root) {
-      new_queue->tq.tq_parent = NULL;
-      new_queue->tq_first_child = NULL;
-      new_queue->tq_next_child = NULL;
-      new_queue->tq_prev_child = NULL;
-      new_queue->tq_ref_count = 1;
-      tq->tq_root = new_queue;
-    } else {
-      curr_queue = tq->tq_curr_thunk[tid]->th.th_shareds->sv_queue;
-      new_queue->tq.tq_parent = curr_queue;
-      new_queue->tq_first_child = NULL;
-      new_queue->tq_prev_child = NULL;
-      new_queue->tq_ref_count = 1; /* for the thread that built the queue */
-
-      KMP_DEBUG_REF_CTS(("line %d gtid %d: Q %p alloc %d\n", __LINE__,
-                         global_tid, new_queue, new_queue->tq_ref_count));
-
-      __kmp_acquire_lock(&curr_queue->tq_link_lck, global_tid);
-
-      // Make sure data structures are in consistent state before querying them
-      // Seems to work without this for digital/alpha, needed for IBM/RS6000
-      KMP_MB();
-
-      new_queue->tq_next_child =
-          CCAST(struct kmpc_task_queue_t *, curr_queue->tq_first_child);
-
-      if (curr_queue->tq_first_child != NULL)
-        curr_queue->tq_first_child->tq_prev_child = new_queue;
-
-      curr_queue->tq_first_child = new_queue;
-
-      __kmp_release_lock(&curr_queue->tq_link_lck, global_tid);
-    }
-
-    /* set up thunk stack only after the code that determines curr_queue above */
-    new_taskq_thunk->th_encl_thunk = tq->tq_curr_thunk[tid];
-    tq->tq_curr_thunk[tid] = new_taskq_thunk;
-
-    KF_DUMP(200, __kmp_dump_thunk_stack(tq->tq_curr_thunk[tid], global_tid));
-  } else {
-    new_taskq_thunk->th_encl_thunk = 0;
-    new_queue->tq.tq_parent = NULL;
-    new_queue->tq_first_child = NULL;
-    new_queue->tq_next_child = NULL;
-    new_queue->tq_prev_child = NULL;
-    new_queue->tq_ref_count = 1;
-  }
-
-#ifdef KMP_DEBUG
-  KF_TRACE(150, ("Creating TaskQ Task on (%d):\n", global_tid));
-  KF_DUMP(150, __kmp_dump_thunk(tq, new_taskq_thunk, global_tid));
-
-  if (in_parallel) {
-    KF_TRACE(25,
-             ("After TaskQ at %p Creation on (%d):\n", new_queue, global_tid));
-  } else {
-    KF_TRACE(25, ("After Serial TaskQ at %p Creation on (%d):\n", new_queue,
-                  global_tid));
-  }
-
-  KF_DUMP(25, __kmp_dump_task_queue(tq, new_queue, global_tid));
-
-  if (in_parallel) {
-    KF_DUMP(50, __kmp_dump_task_queue_tree(tq, tq->tq_root, global_tid));
-  }
-#endif /* KMP_DEBUG */
-
-  if (__kmp_env_consistency_check)
-    __kmp_push_workshare(global_tid, ct_taskq, new_queue->tq_loc);
-
-  KE_TRACE(10, ("__kmpc_taskq return (%d)\n", global_tid));
-
-  return new_taskq_thunk;
-}
-
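The tq_tasknum_queuing / tq_tasknum_serving pair initialized above implements ticket-style ordering for ORDERED taskqs: each task takes the next queuing number when it is enqueued, and a task may enter its ordered section only when the serving counter reaches its ticket. A minimal free-standing sketch of that discipline (hypothetical names; this is not the runtime's __kmp_taskq_eo/__kmp_taskq_xo implementation):

#include <atomic>

struct OrderedQueue {
  std::atomic<unsigned> queuing{0}; // last ticket handed out
  std::atomic<unsigned> serving{1}; // ticket currently allowed to run
};

// At enqueue time: take the next ticket (tickets start at 1, matching
// tq_tasknum_serving being initialized to 1 above).
unsigned take_ticket(OrderedQueue &q) { return q.queuing.fetch_add(1) + 1; }

// Before the ordered section: wait for our turn.
void ordered_enter(OrderedQueue &q, unsigned ticket) {
  while (q.serving.load(std::memory_order_acquire) != ticket)
    ; // spin; the real runtime uses KMP_CPU_PAUSE() in such loops
}

// After the ordered section: pass the baton to the next ticket holder.
void ordered_exit(OrderedQueue &q) {
  q.serving.fetch_add(1, std::memory_order_release);
}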
-/* ends a taskq; the last thread out destroys the queue */
-
-void __kmpc_end_taskq(ident_t *loc, kmp_int32 global_tid,
-                      kmpc_thunk_t *taskq_thunk) {
-#ifdef KMP_DEBUG
-  kmp_int32 i;
-#endif
-  kmp_taskq_t *tq;
-  int in_parallel;
-  kmp_info_t *th;
-  kmp_int32 is_outermost;
-  kmpc_task_queue_t *queue;
-  kmpc_thunk_t *thunk;
-  int nproc;
-
-  KE_TRACE(10, ("__kmpc_end_taskq called (%d)\n", global_tid));
-
-  tq = &__kmp_threads[global_tid]->th.th_team->t.t_taskq;
-  nproc = __kmp_threads[global_tid]->th.th_team->t.t_nproc;
-
-  /* For the outermost taskq only, all but one thread will have taskq_thunk ==
-   * NULL */
-  queue = (taskq_thunk == NULL) ? tq->tq_root
-                                : taskq_thunk->th.th_shareds->sv_queue;
-
-  KE_TRACE(50, ("__kmpc_end_taskq queue=%p (%d)\n", queue, global_tid));
-  is_outermost = (queue == tq->tq_root);
-  in_parallel = (queue->tq_flags & TQF_PARALLEL_CONTEXT);
-
-  if (in_parallel) {
-    /* this is just a safeguard to release the waiting threads if */
-    /* the outermost taskq never queues a task */
-
-    if (is_outermost && (KMP_MASTER_GTID(global_tid))) {
-      if (tq->tq_global_flags & TQF_RELEASE_WORKERS) {
-        /* no lock needed; workers are still in spin mode */
-        tq->tq_global_flags &= ~TQF_RELEASE_WORKERS;
-
-        __kmp_end_split_barrier(bs_plain_barrier, global_tid);
-      }
-    }
-
-    /* keep dequeueing work until all tasks are queued and dequeued */
-
-    do {
-      /* wait until something is available to dequeue */
-      while ((queue->tq_nfull == 0) && (queue->tq_taskq_slot == NULL) &&
-             (!__kmp_taskq_has_any_children(queue)) &&
-             (!(queue->tq_flags & TQF_ALL_TASKS_QUEUED))) {
-        KMP_CPU_PAUSE();
-      }
-
-      /* check to see if we can execute tasks in the queue */
-      while (((queue->tq_nfull != 0) || (queue->tq_taskq_slot != NULL)) &&
-             (thunk = __kmp_find_task_in_queue(global_tid, queue)) != NULL) {
-        KF_TRACE(50, ("Found thunk: %p in primary queue %p (%d)\n", thunk,
-                      queue, global_tid));
-        __kmp_execute_task_from_queue(tq, loc, global_tid, thunk, in_parallel);
-      }
-
-      /* see if work can be found in a descendant queue */
-      if ((__kmp_taskq_has_any_children(queue)) &&
-          (thunk = __kmp_find_task_in_descendant_queue(global_tid, queue)) !=
-              NULL) {
-
-        KF_TRACE(50,
-                 ("Stole thunk: %p in descendant queue: %p while waiting in "
-                  "queue: %p (%d)\n",
-                  thunk, thunk->th.th_shareds->sv_queue, queue, global_tid));
-
-        __kmp_execute_task_from_queue(tq, loc, global_tid, thunk, in_parallel);
-      }
-
-    } while ((!(queue->tq_flags & TQF_ALL_TASKS_QUEUED)) ||
-             (queue->tq_nfull != 0));
-
-    KF_TRACE(50, ("All tasks queued and dequeued in queue: %p (%d)\n", queue,
-                  global_tid));
-
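The do/while loop above interleaves three states: spin while nothing is available, drain the local queue, and try stealing from a descendant, until TQF_ALL_TASKS_QUEUED is set and the queue is empty. A condensed sketch of that control flow, parameterized over hypothetical queue operations (the runtime queries tq_nfull, tq_taskq_slot, the child list, and the flag word directly):

#include <functional>

// Condensed shape of the drain loop; "execute_one" stands in for both
// running a local task and stealing from a descendant queue.
void drain_until_all_queued(const std::function<bool()> &work_available,
                            const std::function<bool()> &all_queued,
                            const std::function<void()> &pause,
                            const std::function<bool()> &execute_one) {
  do {
    while (!work_available() && !all_queued())
      pause();            // nothing to dequeue yet: spin politely
    while (execute_one()) // drain whatever can be found
      ;
  } while (!all_queued() || work_available());
}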
-    /* wait while not all tasks are finished and more work can be found
-       in descendant queues */
-
-    while ((!__kmp_taskq_tasks_finished(queue)) &&
-           (thunk = __kmp_find_task_in_descendant_queue(global_tid, queue)) !=
-               NULL) {
-
-      KF_TRACE(50, ("Stole thunk: %p in descendant queue: %p while waiting in "
-                    "queue: %p (%d)\n",
-                    thunk, thunk->th.th_shareds->sv_queue, queue, global_tid));
-
-      __kmp_execute_task_from_queue(tq, loc, global_tid, thunk, in_parallel);
-    }
-
-    KF_TRACE(50, ("No work found in descendant queues or all work finished in "
-                  "queue: %p (%d)\n",
-                  queue, global_tid));
-
-    if (!is_outermost) {
-      /* need to return if NOWAIT is present and this is not the outermost
-       * taskq */
-
-      if (queue->tq_flags & TQF_IS_NOWAIT) {
-        __kmp_acquire_lock(&queue->tq.tq_parent->tq_link_lck, global_tid);
-        queue->tq_ref_count--;
-        KMP_DEBUG_ASSERT(queue->tq_ref_count >= 0);
-        __kmp_release_lock(&queue->tq.tq_parent->tq_link_lck, global_tid);
-
-        KE_TRACE(
-            10, ("__kmpc_end_taskq return for nowait case (%d)\n", global_tid));
-
-        return;
-      }
-
-      __kmp_find_and_remove_finished_child_taskq(tq, global_tid, queue);
-
-      /* WAIT until all tasks are finished and no child queues exist before
-       * proceeding */
-
-      while (!__kmp_taskq_tasks_finished(queue) ||
-             __kmp_taskq_has_any_children(queue)) {
-        thunk = __kmp_find_task_in_ancestor_queue(tq, global_tid, queue);
-
-        if (thunk != NULL) {
-          KF_TRACE(50,
-                   ("Stole thunk: %p in ancestor queue: %p while waiting in "
-                    "queue: %p (%d)\n",
-                    thunk, thunk->th.th_shareds->sv_queue, queue, global_tid));
-          __kmp_execute_task_from_queue(tq, loc, global_tid, thunk,
-                                        in_parallel);
-        }
-
-        if (thunk == NULL)
-          KMP_CPU_PAUSE();
-
-        __kmp_find_and_remove_finished_child_taskq(tq, global_tid, queue);
-      }
-
-      __kmp_acquire_lock(&queue->tq_queue_lck, global_tid);
-      if (!(queue->tq_flags & TQF_DEALLOCATED)) {
-        queue->tq_flags |= TQF_DEALLOCATED;
-      }
-      __kmp_release_lock(&queue->tq_queue_lck, global_tid);
-
-      /* only the allocating thread can deallocate the queue */
-      if (taskq_thunk != NULL) {
-        __kmp_remove_queue_from_tree(tq, global_tid, queue, TRUE);
-      }
-
-      KE_TRACE(
-          10,
-          ("__kmpc_end_taskq return for non_outermost queue, wait case (%d)\n",
-           global_tid));
-
-      return;
-    }
-
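The non-outermost exit path above uses a "mark once, free once" idiom: every thread may mark the queue TQF_DEALLOCATED under tq_queue_lck, but only the thread that allocated it (the one holding a non-NULL taskq_thunk) unlinks and frees it. A minimal sketch of the idiom with a hypothetical ToyQueue type:

#include <mutex>

struct ToyQueue {
  std::mutex lck;
  unsigned flags = 0;
  static constexpr unsigned DEALLOCATED = 0x2000; // mirrors TQF_DEALLOCATED
};

// Idempotent under the lock: racing threads may all mark, harmlessly.
void mark_deallocated(ToyQueue &q) {
  std::lock_guard<std::mutex> guard(q.lck);
  q.flags |= ToyQueue::DEALLOCATED;
}

void end_queue(ToyQueue &q, bool i_allocated_it) {
  mark_deallocated(q);
  if (i_allocated_it) {
    // the sole deallocator would unlink and free here, as
    // __kmp_remove_queue_from_tree() does in the code above
  }
}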
-    // Outermost queue: steal work from descendants until all tasks finish
-
-    while (!__kmp_taskq_tasks_finished(queue)) {
-      thunk = __kmp_find_task_in_descendant_queue(global_tid, queue);
-
-      if (thunk != NULL) {
-        KF_TRACE(50,
-                 ("Stole thunk: %p in descendant queue: %p while waiting in "
-                  "queue: %p (%d)\n",
-                  thunk, thunk->th.th_shareds->sv_queue, queue, global_tid));
-
-        __kmp_execute_task_from_queue(tq, loc, global_tid, thunk, in_parallel);
-      }
-
-      if (thunk == NULL)
-        KMP_CPU_PAUSE();
-    }
-
-    /* Need this barrier to prevent destruction of the queue before all
-     * threads have executed the code above. */
-    /* This may need to be done earlier when NOWAIT is implemented for the
-     * outermost level. */
-
-    if (!__kmp_barrier(bs_plain_barrier, global_tid, TRUE, 0, NULL, NULL)) {
-      /* the queue->tq_flags & TQF_IS_NOWAIT case is not yet handled here; */
-      /* for right now, everybody waits, and the master thread destroys the */
-      /* remaining queues. */
-
-      __kmp_remove_all_child_taskq(tq, global_tid, queue);
-
-      /* Now destroy the root queue */
-      KF_TRACE(100, ("T#%d Before Deletion of top-level TaskQ at %p:\n",
-                     global_tid, queue));
-      KF_DUMP(100, __kmp_dump_task_queue(tq, queue, global_tid));
-
-#ifdef KMP_DEBUG
-      /* the root queue entry */
-      KMP_DEBUG_ASSERT((queue->tq.tq_parent == NULL) &&
-                       (queue->tq_next_child == NULL));
-
-      /* children must all be gone by now because of the barrier above */
-      KMP_DEBUG_ASSERT(queue->tq_first_child == NULL);
-
-      for (i = 0; i < nproc; i++) {
-        KMP_DEBUG_ASSERT(queue->tq_th_thunks[i].ai_data == 0);
-      }
-
-      for (i = 0, thunk = queue->tq_free_thunks; thunk != NULL;
-           i++, thunk = thunk->th.th_next_free)
-        ;
-
-      KMP_DEBUG_ASSERT(i ==
-                       queue->tq_nslots + (nproc * __KMP_TASKQ_THUNKS_PER_TH));
-
-      for (i = 0; i < nproc; i++) {
-        KMP_DEBUG_ASSERT(!tq->tq_curr_thunk[i]);
-      }
-#endif
-      /* unlink the root queue entry */
-      tq->tq_root = NULL;
-
-      /* release storage for the root queue entry */
-      KF_TRACE(50, ("After Deletion of top-level TaskQ at %p on (%d):\n", queue,
-                    global_tid));
-
-      queue->tq_flags |= TQF_DEALLOCATED;
-      __kmp_free_taskq(tq, queue, in_parallel, global_tid);
-
-      KF_DUMP(50, __kmp_dump_task_queue_tree(tq, tq->tq_root, global_tid));
-
-      /* release the workers now that the data structures are up to date */
-      __kmp_end_split_barrier(bs_plain_barrier, global_tid);
-    }
-
-    th = __kmp_threads[global_tid];
-
-    /* Reset ORDERED SECTION to parallel version */
-    th->th.th_dispatch->th_deo_fcn = 0;
-
-    /* Reset ORDERED SECTION to parallel version */
-    th->th.th_dispatch->th_dxo_fcn = 0;
-  } else {
-    /* in a serial execution context, dequeue the last task */
-    /* and execute it, if any tasks were encountered */
-
-    if (queue->tq_nfull > 0) {
-      KMP_DEBUG_ASSERT(queue->tq_nfull == 1);
-
-      thunk = __kmp_dequeue_task(global_tid, queue, in_parallel);
-
-      if (queue->tq_flags & TQF_IS_LAST_TASK) {
-        /* TQF_IS_LASTPRIVATE, one thing in the queue, __kmpc_end_taskq_task() */
-        /* has been run, so this is the last task; run it with */
-        /* TQF_IS_LAST_TASK so the instrumentation does the copy-out. */
-
-        /* no need for a test_then_or call since already locked */
-        thunk->th_flags |= TQF_IS_LAST_TASK;
-      }
-
-      KF_TRACE(50, ("T#%d found thunk: %p in serial queue: %p\n", global_tid,
-                    thunk, queue));
-
-      __kmp_execute_task_from_queue(tq, loc, global_tid, thunk, in_parallel);
-    }
-
-    // destroy the unattached serial queue now that there is no more work to do
-    KF_TRACE(100, ("Before Deletion of Serialized TaskQ at %p on (%d):\n",
-                   queue, global_tid));
-    KF_DUMP(100, __kmp_dump_task_queue(tq, queue, global_tid));
-
-#ifdef KMP_DEBUG
-    i = 0;
-    for (thunk = queue->tq_free_thunks; thunk != NULL;
-         thunk = thunk->th.th_next_free)
-      ++i;
-    KMP_DEBUG_ASSERT(i == queue->tq_nslots + 1);
-#endif
-    /* release storage for the unattached serial queue */
-    KF_TRACE(50,
-             ("Serialized TaskQ at %p deleted on (%d).\n", queue, global_tid));
-
-    queue->tq_flags |= TQF_DEALLOCATED;
-    __kmp_free_taskq(tq, queue, in_parallel, global_tid);
-  }
-
-  KE_TRACE(10, ("__kmpc_end_taskq return (%d)\n", global_tid));
-}
-
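The teardown above relies on a gather/release split: every thread arrives at the barrier, one thread performs the tree destruction, and __kmp_end_split_barrier then releases the waiters. A simplified, hypothetical sketch of that shape (the runtime's split barrier is integrated with the team barrier machinery, and it is the master thread, not necessarily the last arriver, that does the teardown there):

#include <atomic>
#include <thread>

struct SplitBarrier {
  std::atomic<int> arrived{0};
  std::atomic<bool> released{false};
  int nthreads;
  explicit SplitBarrier(int n) : nthreads(n) {}

  // Returns true for exactly one thread, which may then run teardown
  // work on behalf of the team before calling release().
  bool gather() {
    if (arrived.fetch_add(1) + 1 == nthreads)
      return true;
    while (!released.load(std::memory_order_acquire))
      std::this_thread::yield(); // the runtime spins with KMP_CPU_PAUSE()
    return false;
  }
  void release() { released.store(true, std::memory_order_release); }
};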
-/* enqueues a task for a thunk previously created by __kmpc_task_buffer */
-/* returns nonzero if it just filled up the queue */
-
-kmp_int32 __kmpc_task(ident_t *loc, kmp_int32 global_tid, kmpc_thunk_t *thunk) {
-  kmp_int32 ret;
-  kmpc_task_queue_t *queue;
-  int in_parallel;
-  kmp_taskq_t *tq;
-
-  KE_TRACE(10, ("__kmpc_task called (%d)\n", global_tid));
-
-  KMP_DEBUG_ASSERT(!(thunk->th_flags &
-                     TQF_TASKQ_TASK)); /* thunk->th_task is a regular task */
-
-  tq = &__kmp_threads[global_tid]->th.th_team->t.t_taskq;
-  queue = thunk->th.th_shareds->sv_queue;
-  in_parallel = (queue->tq_flags & TQF_PARALLEL_CONTEXT);
-
-  if (in_parallel && (thunk->th_flags & TQF_IS_ORDERED))
-    thunk->th_tasknum = ++queue->tq_tasknum_queuing;
-
-  /* For serial execution, dequeue the preceding task and execute it, if one
-   * exists. This cannot be the last task; that one is handled in
-   * __kmpc_end_taskq. */
-
-  if (!in_parallel && queue->tq_nfull > 0) {
-    kmpc_thunk_t *prev_thunk;
-
-    KMP_DEBUG_ASSERT(queue->tq_nfull == 1);
-
-    prev_thunk = __kmp_dequeue_task(global_tid, queue, in_parallel);
-
-    KF_TRACE(50, ("T#%d found thunk: %p in serial queue: %p\n", global_tid,
-                  prev_thunk, queue));
-
-    __kmp_execute_task_from_queue(tq, loc, global_tid, prev_thunk, in_parallel);
-  }
-
-  /* The instrumentation sequence is: __kmpc_task_buffer(), initialize private
-     variables, __kmpc_task(). The __kmpc_task_buffer routine checks that the
-     task queue is not full and allocates a thunk (which is then passed to
-     __kmpc_task()). So, the enqueue below should never fail due to a full
-     queue. */
-
-  KF_TRACE(100, ("After enqueueing this Task on (%d):\n", global_tid));
-  KF_DUMP(100, __kmp_dump_thunk(tq, thunk, global_tid));
-
-  ret = __kmp_enqueue_task(tq, global_tid, queue, thunk, in_parallel);
-
-  KF_TRACE(100, ("Task Queue looks like this on (%d):\n", global_tid));
-  KF_DUMP(100, __kmp_dump_task_queue(tq, queue, global_tid));
-
-  KE_TRACE(10, ("__kmpc_task return (%d)\n", global_tid));
-
-  return ret;
-}
-
-/* enqueues a taskq_task for the thunk previously created by __kmpc_taskq */
-/* this should never be called unless in a parallel context */
-
-void __kmpc_taskq_task(ident_t *loc, kmp_int32 global_tid, kmpc_thunk_t *thunk,
-                       kmp_int32 status) {
-  kmpc_task_queue_t *queue;
-  kmp_taskq_t *tq = &__kmp_threads[global_tid]->th.th_team->t.t_taskq;
-  int tid = __kmp_tid_from_gtid(global_tid);
-
-  KE_TRACE(10, ("__kmpc_taskq_task called (%d)\n", global_tid));
-  KF_TRACE(100, ("TaskQ Task argument thunk on (%d):\n", global_tid));
-  KF_DUMP(100, __kmp_dump_thunk(tq, thunk, global_tid));
-
-  queue = thunk->th.th_shareds->sv_queue;
-
-  if (__kmp_env_consistency_check)
-    __kmp_pop_workshare(global_tid, ct_taskq, loc);
-
-  /* thunk->th_task is the taskq_task */
-  KMP_DEBUG_ASSERT(thunk->th_flags & TQF_TASKQ_TASK);
-
-  /* not supposed to call __kmpc_taskq_task if it's already enqueued */
-  KMP_DEBUG_ASSERT(queue->tq_taskq_slot == NULL);
-
-  /* dequeue the taskq thunk from the curr_thunk stack */
-  tq->tq_curr_thunk[tid] = thunk->th_encl_thunk;
-  thunk->th_encl_thunk = NULL;
-
-  KF_DUMP(200, __kmp_dump_thunk_stack(tq->tq_curr_thunk[tid], global_tid));
-
-  thunk->th_status = status;
-
-  // Flush thunk->th_status before the taskq_task is enqueued to avoid a race
-  KMP_MB();
-
-  /* enqueue the taskq_task thunk in the queue's special slot */
-  /* GEH - probably don't need to lock the taskq slot since only one thread */
-  /* enqueues and a lock is already set at the dequeue point */
-
-  queue->tq_taskq_slot = thunk;
-
-  KE_TRACE(10, ("__kmpc_taskq_task return (%d)\n", global_tid));
-}
-
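The publish step above is a classic release pattern: fill in the thunk's status first, fence, then make the thunk visible through the queue's single taskq-task slot. A sketch of the same protocol in standard C++ (simplified, hypothetical types; a release store plays the role of the KMP_MB() before the assignment to tq_taskq_slot):

#include <atomic>

struct Thunk { int status = 0; };

struct SlotQueue {
  std::atomic<Thunk *> taskq_slot{nullptr}; // occupied iff non-null
};

void publish_taskq_task(SlotQueue &q, Thunk *t, int status) {
  t->status = status;                               // written first...
  q.taskq_slot.store(t, std::memory_order_release); // ...then published
}

// Consumer side: take the slot's thunk, if any, with matching acquire
// ordering so the status write above is visible.
Thunk *try_take_taskq_task(SlotQueue &q) {
  return q.taskq_slot.exchange(nullptr, std::memory_order_acquire);
}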
-/* ends a taskq_task; done generating tasks */
-
-void __kmpc_end_taskq_task(ident_t *loc, kmp_int32 global_tid,
-                           kmpc_thunk_t *thunk) {
-  kmp_taskq_t *tq;
-  kmpc_task_queue_t *queue;
-  int in_parallel;
-  int tid;
-
-  KE_TRACE(10, ("__kmpc_end_taskq_task called (%d)\n", global_tid));
-
-  tq = &__kmp_threads[global_tid]->th.th_team->t.t_taskq;
-  queue = thunk->th.th_shareds->sv_queue;
-  in_parallel = (queue->tq_flags & TQF_PARALLEL_CONTEXT);
-  tid = __kmp_tid_from_gtid(global_tid);
-
-  if (__kmp_env_consistency_check)
-    __kmp_pop_workshare(global_tid, ct_taskq, loc);
-
-  if (in_parallel) {
-#if KMP_ARCH_X86 || KMP_ARCH_X86_64
-    KMP_TEST_THEN_OR32(RCAST(volatile kmp_uint32 *, &queue->tq_flags),
-                       TQF_ALL_TASKS_QUEUED);
-#else
-    {
-      __kmp_acquire_lock(&queue->tq_queue_lck, global_tid);
-
-      // Make sure data structures are in a consistent state before querying
-      // them. Seems to work without this for digital/alpha, needed for
-      // IBM/RS6000.
-      KMP_MB();
-
-      queue->tq_flags |= TQF_ALL_TASKS_QUEUED;
-      __kmp_release_lock(&queue->tq_queue_lck, global_tid);
-    }
-#endif
-  }
-
-  if (thunk->th_flags & TQF_IS_LASTPRIVATE) {
-    /* Normally, __kmp_find_task_in_queue() refuses to schedule the last task
-       in the queue if TQF_IS_LASTPRIVATE is set, so we can positively identify
-       that last task and run it with its TQF_IS_LAST_TASK bit turned on in
-       th_flags. When __kmpc_end_taskq_task() is called we are done generating
-       all the tasks, so we know the last one in the queue is the lastprivate
-       task. Mark the queue as having gotten to this state via tq_flags &
-       TQF_IS_LAST_TASK; when that task actually executes, mark it via th_flags
-       & TQF_IS_LAST_TASK (this th_flags bit signals the instrumented code to
-       do the copy-outs after execution). */
-    if (!in_parallel) {
-      /* No synchronization needed for serial context */
-      queue->tq_flags |= TQF_IS_LAST_TASK;
-    } else {
-#if KMP_ARCH_X86 || KMP_ARCH_X86_64
-      KMP_TEST_THEN_OR32(RCAST(volatile kmp_uint32 *, &queue->tq_flags),
-                         TQF_IS_LAST_TASK);
-#else
-      {
-        __kmp_acquire_lock(&queue->tq_queue_lck, global_tid);
-
-        // Make sure data structures are in a consistent state before querying
-        // them. Seems to work without this for digital/alpha, needed for
-        // IBM/RS6000.
-        KMP_MB();
-
-        queue->tq_flags |= TQF_IS_LAST_TASK;
-        __kmp_release_lock(&queue->tq_queue_lck, global_tid);
-      }
-#endif
-      /* to prevent a race condition where the last task is dequeued but */
-      /* the flag isn't visible yet (not sure about this) */
-      KMP_MB();
-    }
-  }
-
-  /* dequeue the taskq thunk from the curr_thunk stack */
-  if (in_parallel) {
-    tq->tq_curr_thunk[tid] = thunk->th_encl_thunk;
-    thunk->th_encl_thunk = NULL;
-
-    KF_DUMP(200, __kmp_dump_thunk_stack(tq->tq_curr_thunk[tid], global_tid));
-  }
-
-  KE_TRACE(10, ("__kmpc_end_taskq_task return (%d)\n", global_tid));
-}
-
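The flag-setting pattern above has two arms: on x86 a single atomic OR (KMP_TEST_THEN_OR32) suffices, while other architectures fall back to a lock around a plain read-modify-write. A sketch of the equivalence in standard C++ (hypothetical flag word, not the runtime's types):

#include <atomic>
#include <mutex>

constexpr unsigned ALL_TASKS_QUEUED = 0x0800; // mirrors TQF_ALL_TASKS_QUEUED

// Arm 1: a single atomic read-modify-write sets the bit.
void set_flag_atomic(std::atomic<unsigned> &flags) {
  flags.fetch_or(ALL_TASKS_QUEUED, std::memory_order_acq_rel);
}

// Arm 2: a lock substitutes for atomicity on a plain flag word.
void set_flag_locked(unsigned &flags, std::mutex &lck) {
  std::lock_guard<std::mutex> guard(lck);
  flags |= ALL_TASKS_QUEUED;
}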
-/* returns thunk for a regular task based on taskq_thunk */
-/* (__kmpc_taskq_task does the analogous thing for a TQF_TASKQ_TASK) */
-
-kmpc_thunk_t *__kmpc_task_buffer(ident_t *loc, kmp_int32 global_tid,
-                                 kmpc_thunk_t *taskq_thunk, kmpc_task_t task) {
-  kmp_taskq_t *tq;
-  kmpc_task_queue_t *queue;
-  kmpc_thunk_t *new_thunk;
-  int in_parallel;
-
-  KE_TRACE(10, ("__kmpc_task_buffer called (%d)\n", global_tid));
-
-  KMP_DEBUG_ASSERT(
-      taskq_thunk->th_flags &
-      TQF_TASKQ_TASK); /* taskq_thunk->th_task is the taskq_task */
-
-  tq = &__kmp_threads[global_tid]->th.th_team->t.t_taskq;
-  queue = taskq_thunk->th.th_shareds->sv_queue;
-  in_parallel = (queue->tq_flags & TQF_PARALLEL_CONTEXT);
-
-  /* The instrumentation sequence is: __kmpc_task_buffer(), initialize private
-     variables, __kmpc_task(). The __kmpc_task_buffer routine checks that the
-     task queue is not full and allocates a thunk (which is then passed to
-     __kmpc_task()). So, we can pre-allocate a thunk here assuming it will be
-     the next to be enqueued in __kmpc_task(). */
-
-  new_thunk = __kmp_alloc_thunk(queue, in_parallel, global_tid);
-  new_thunk->th.th_shareds =
-      CCAST(kmpc_shared_vars_t *, queue->tq_shareds[0].ai_data);
-  new_thunk->th_encl_thunk = NULL;
-  new_thunk->th_task = task;
-
-  /* GEH - shouldn't need to lock the read of tq_flags here */
-  new_thunk->th_flags = queue->tq_flags & TQF_INTERFACE_FLAGS;
-
-  new_thunk->th_status = 0;
-
-  KMP_DEBUG_ASSERT(!(new_thunk->th_flags & TQF_TASKQ_TASK));
-
-  KF_TRACE(100, ("Creating Regular Task on (%d):\n", global_tid));
-  KF_DUMP(100, __kmp_dump_thunk(tq, new_thunk, global_tid));
-
-  KE_TRACE(10, ("__kmpc_task_buffer return (%d)\n", global_tid));
-
-  return new_thunk;
-}
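Taken together, the entry points removed above were driven by compiler-generated code in a fixed order: reserve a thunk, fill it, enqueue it, then signal end of generation. The following runnable sketch demonstrates only that ordering; the stub types and bodies are hypothetical stand-ins (the real signatures take ident_t, gtid, and kmpc_thunk_t arguments as in the definitions above):

#include <cstdio>

struct thunk {};
// Stubs named after the real entry points, for call-order illustration only.
static thunk *taskq() { std::puts("__kmpc_taskq"); static thunk t; return &t; }
static thunk *task_buffer(thunk *) { std::puts("__kmpc_task_buffer"); static thunk t; return &t; }
static void task(thunk *) { std::puts("__kmpc_task"); }
static void end_taskq_task(thunk *) { std::puts("__kmpc_end_taskq_task"); }
static void end_taskq(thunk *) { std::puts("__kmpc_end_taskq"); }

int main() {
  thunk *tq = taskq();          // generating thread gets a thunk; at the root,
                                // workers get NULL and go execute tasks
  for (int i = 0; i < 3; ++i) { // generate three tasks
    thunk *t = task_buffer(tq); // reserve a thunk for the task body
    /* initialize the task's private data here */
    task(t);                    // enqueue it (serial context may run one inline)
  }
  end_taskq_task(tq);           // done generating tasks
  end_taskq(tq);                // everyone: drain, synchronize, destroy queue
}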