Index: runtime/src/kmp.h
===================================================================
--- runtime/src/kmp.h
+++ runtime/src/kmp.h
@@ -3023,11 +3023,17 @@ kmp_team_t *team, int tid);
 #if OMP_40_ENABLED
 extern kmp_team_t * __kmp_allocate_team( kmp_root_t *root, int new_nproc, int max_nproc,
+#if OMPT_SUPPORT
+                                         ompt_parallel_id_t ompt_parallel_id,
+#endif
                                          kmp_proc_bind_t proc_bind,
                                          kmp_internal_control_t *new_icvs,
                                          int argc USE_NESTED_HOT_ARG(kmp_info_t *thr) );
 #else
 extern kmp_team_t * __kmp_allocate_team( kmp_root_t *root, int new_nproc, int max_nproc,
+#if OMPT_SUPPORT
+                                         ompt_parallel_id_t ompt_parallel_id,
+#endif
                                          kmp_internal_control_t *new_icvs,
                                          int argc USE_NESTED_HOT_ARG(kmp_info_t *thr) );
 #endif // OMP_40_ENABLED
@@ -3062,7 +3068,7 @@ fork_context_last
 };
 extern int __kmp_fork_call( ident_t *loc, int gtid, enum fork_context_e fork_context,
-                            kmp_int32 argc, microtask_t microtask, launch_t invoker,
+                            kmp_int32 argc, void *unwrapped_task, microtask_t microtask, launch_t invoker,
 /* TODO: revert workaround for Intel(R) 64 tracker #96 */
 #if (KMP_ARCH_ARM || KMP_ARCH_X86_64 || KMP_ARCH_AARCH64) && KMP_OS_LINUX
                             va_list *ap
@@ -3172,7 +3178,11 @@
 #endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */

-extern int __kmp_invoke_microtask( microtask_t pkfn, int gtid, int npr, int argc, void *argv[] );
+extern int __kmp_invoke_microtask( microtask_t pkfn, int gtid, int npr, int argc, void *argv[]
+#if OMPT_SUPPORT
+                                   , void **exit_frame_ptr
+#endif
+);

 /* ------------------------------------------------------------------------ */

Index: runtime/src/kmp_atomic.h
===================================================================
--- runtime/src/kmp_atomic.h
+++ runtime/src/kmp_atomic.h
@@ -19,6 +19,10 @@
 #include "kmp_os.h"
 #include "kmp_lock.h"

+#if OMPT_SUPPORT
+#include "ompt-specific.h"
+#endif
+
 // C++ build port.
 // Intel compiler does not support _Complex datatype on win.
 // Intel compiler supports _Complex datatype on lin and mac.
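For orientation between hunks: every callback site added by this patch follows the same guard sequence, and the consumer is a tool that registers callbacks at startup. A minimal tool-side sketch, assuming the draft-OMPT interface this patch ships in ompt.h (the lookup-based registration and the print_release_atomic name are illustrative, not part of the patch):

    #include <stdio.h>
    #include <inttypes.h>
    #include "ompt.h"   /* tool-facing header added by this patch */

    /* OMPT_BLAME consumer: the releasing thread reports the contended lock. */
    static void print_release_atomic(ompt_wait_id_t wait_id) {
        printf("atomic lock released: wait_id=%" PRIu64 "\n", (uint64_t) wait_id);
    }

    /* The runtime calls this during initialization when a tool is present. */
    int ompt_initialize(ompt_function_lookup_t lookup,
                        const char *runtime_version, unsigned int ompt_version) {
        ompt_set_callback_t set_cb =
            (ompt_set_callback_t) lookup("ompt_set_callback");
        set_cb(ompt_event_release_atomic, (ompt_callback_t) print_release_atomic);
        return 1;   /* nonzero keeps OMPT in ompt_status_track_callback mode */
    }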
@@ -366,7 +370,23 @@
 static inline void
 __kmp_acquire_atomic_lock( kmp_atomic_lock_t *lck, kmp_int32 gtid )
 {
+#if OMPT_SUPPORT && OMPT_TRACE
+    if ((ompt_status == ompt_status_track_callback) &&
+        ompt_callbacks.ompt_callback(ompt_event_wait_atomic)) {
+        ompt_callbacks.ompt_callback(ompt_event_wait_atomic)(
+            (ompt_wait_id_t) lck);
+    }
+#endif
+
     __kmp_acquire_queuing_lock( lck, gtid );
+
+#if OMPT_SUPPORT && OMPT_TRACE
+    if ((ompt_status == ompt_status_track_callback) &&
+        ompt_callbacks.ompt_callback(ompt_event_acquired_atomic)) {
+        ompt_callbacks.ompt_callback(ompt_event_acquired_atomic)(
+            (ompt_wait_id_t) lck);
+    }
+#endif
 }

 static inline int
@@ -379,6 +399,13 @@
 __kmp_release_atomic_lock( kmp_atomic_lock_t *lck, kmp_int32 gtid )
 {
     __kmp_release_queuing_lock( lck, gtid );
+
+#if OMPT_SUPPORT && OMPT_BLAME
+    if ((ompt_status == ompt_status_track_callback) &&
+        ompt_callbacks.ompt_callback(ompt_event_release_atomic)) {
+        ompt_callbacks.ompt_callback(ompt_event_release_atomic)(
+            (ompt_wait_id_t) lck);
+    }
+#endif
 }

 static inline void

Index: runtime/src/kmp_barrier.cpp
===================================================================
--- runtime/src/kmp_barrier.cpp
+++ runtime/src/kmp_barrier.cpp
@@ -1034,10 +1034,37 @@
     register kmp_team_t *team = this_thr->th.th_team;
     register int status = 0;
     ident_t *loc = __kmp_threads[gtid]->th.th_ident;
+#if OMPT_SUPPORT
+    ompt_task_id_t my_task_id;
+    ompt_parallel_id_t my_parallel_id;
+#endif

     KA_TRACE(15, ("__kmp_barrier: T#%d(%d:%d) has arrived\n",
                   gtid, __kmp_team_from_gtid(gtid)->t.t_id, __kmp_tid_from_gtid(gtid)));

+#if OMPT_SUPPORT && OMPT_TRACE
+    if (ompt_status & ompt_status_track) {
+        if (ompt_status == ompt_status_track_callback) {
+            my_task_id = team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_id;
+            my_parallel_id = team->t.ompt_team_info.parallel_id;
+
+            if (this_thr->th.ompt_thread_info.state == ompt_state_wait_single) {
+                if (ompt_callbacks.ompt_callback(ompt_event_single_others_end)) {
+                    ompt_callbacks.ompt_callback(ompt_event_single_others_end)(
+                        my_parallel_id, my_task_id);
+                }
+            }
+            this_thr->th.ompt_thread_info.state = ompt_state_wait_barrier;
+            if (ompt_callbacks.ompt_callback(ompt_event_barrier_begin)) {
+                ompt_callbacks.ompt_callback(ompt_event_barrier_begin)(
+                    my_parallel_id, my_task_id);
+            }
+        } else {
+            this_thr->th.ompt_thread_info.state = ompt_state_wait_barrier;
+        }
+    }
+#endif
+
     if (! team->t.t_serialized) {
 #if USE_ITT_BUILD
         // This value will be used in itt notify events below.
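A tool that registers for the begin/end pair above can attribute per-thread barrier wait time, since both events carry the parallel id and the implicit task id. A sketch under that assumption (callback names are illustrative):

    #include <stdio.h>
    #include <inttypes.h>
    #include <omp.h>
    #include "ompt.h"

    static __thread double barrier_enter_time;   /* set at barrier arrival */

    static void on_barrier_begin(ompt_parallel_id_t parallel_id,
                                 ompt_task_id_t task_id) {
        barrier_enter_time = omp_get_wtime();    /* thread starts waiting */
    }

    static void on_barrier_end(ompt_parallel_id_t parallel_id,
                               ompt_task_id_t task_id) {
        printf("task %" PRIu64 " waited %g s at barrier\n",
               (uint64_t) task_id, omp_get_wtime() - barrier_enter_time);
    }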
@@ -1195,6 +1222,20 @@
     }
     KA_TRACE(15, ("__kmp_barrier: T#%d(%d:%d) is leaving with return value %d\n",
                   gtid, __kmp_team_from_gtid(gtid)->t.t_id, __kmp_tid_from_gtid(gtid), status));
+
+#if OMPT_SUPPORT
+    if (ompt_status & ompt_status_track) {
+#if OMPT_TRACE
+        if ((ompt_status == ompt_status_track_callback) &&
+            ompt_callbacks.ompt_callback(ompt_event_barrier_end)) {
+            ompt_callbacks.ompt_callback(ompt_event_barrier_end)(
+                my_parallel_id, my_task_id);
+        }
+#endif
+        this_thr->th.ompt_thread_info.state = ompt_state_work_parallel;
+    }
+#endif
+
     return status;
 }

@@ -1286,6 +1327,16 @@
     KMP_DEBUG_ASSERT(this_thr == team->t.t_threads[tid]);
     KA_TRACE(10, ("__kmp_join_barrier: T#%d(%d:%d) arrived at join barrier\n", gtid, team_id, tid));

+#if OMPT_SUPPORT && OMPT_TRACE
+    if ((ompt_status == ompt_status_track_callback) &&
+        ompt_callbacks.ompt_callback(ompt_event_barrier_begin)) {
+        ompt_callbacks.ompt_callback(ompt_event_barrier_begin)(
+            team->t.ompt_team_info.parallel_id,
+            team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_id);
+    }
+    this_thr->th.ompt_thread_info.state = ompt_state_wait_barrier;
+#endif
+
     if (__kmp_tasking_mode == tskm_extra_barrier) {
         __kmp_tasking_barrier(team, this_thr, gtid);
         KA_TRACE(10, ("__kmp_join_barrier: T#%d(%d:%d) past taking barrier\n", gtid, team_id, tid));
@@ -1401,6 +1452,22 @@
     // TODO now, mark worker threads as done so they may be disbanded
     KMP_MB(); // Flush all pending memory write invalidates.
     KA_TRACE(10, ("__kmp_join_barrier: T#%d(%d:%d) leaving\n", gtid, team_id, tid));
+
+#if OMPT_SUPPORT
+    if (ompt_status & ompt_status_track) {
+#if OMPT_TRACE
+        if ((ompt_status == ompt_status_track_callback) &&
+            ompt_callbacks.ompt_callback(ompt_event_barrier_end)) {
+            ompt_callbacks.ompt_callback(ompt_event_barrier_end)(
+                team->t.ompt_team_info.parallel_id,
+                team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_id);
+        }
+#endif
+
+        // return to default state
+        this_thr->th.ompt_thread_info.state = ompt_state_overhead;
+    }
+#endif
 }

Index: runtime/src/kmp_csupport.c
===================================================================
--- runtime/src/kmp_csupport.c
+++ runtime/src/kmp_csupport.c
@@ -20,6 +20,11 @@
 #include "kmp_error.h"
 #include "kmp_stats.h"

+#if OMPT_SUPPORT
+#include "ompt-internal.h"
+#include "ompt-specific.h"
+#endif
+
 #define MAX_MESSAGE 512

 /* ------------------------------------------------------------------------ */
@@ -283,12 +288,21 @@
     va_list ap;
     va_start( ap, microtask );

+#if OMPT_SUPPORT
+    kmp_info_t *master_th = __kmp_threads[ gtid ];
+    kmp_team_t *parent_team = master_th->th.th_team;
+    int tid = __kmp_tid_from_gtid( gtid );
+    parent_team->t.t_implicit_task_taskdata[tid].
+        ompt_task_info.frame.reenter_runtime_frame = __builtin_frame_address(0);
+#endif
+
 #if INCLUDE_SSC_MARKS
     SSC_MARK_FORKING();
 #endif
     __kmp_fork_call( loc, gtid, fork_context_intel, argc,
-                     VOLATILE_CAST(microtask_t) microtask,
+                     VOLATILE_CAST(void *) microtask,      // "unwrapped" task
+                     VOLATILE_CAST(microtask_t) microtask, // "wrapped" task
                      VOLATILE_CAST(launch_t) __kmp_invoke_task_func,
 /* TODO: revert workaround for Intel(R) 64 tracker #96 */
 #if (KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64) && KMP_OS_LINUX
@@ -303,6 +317,13 @@
     __kmp_join_call( loc, gtid );

     va_end( ap );
+
+#if OMPT_SUPPORT
+    if (ompt_status & ompt_status_track) {
+        parent_team->t.t_implicit_task_taskdata[tid].
+            ompt_task_info.frame.reenter_runtime_frame = 0;
+    }
+#endif
     }
     KMP_START_EXPLICIT_TIMER(OMP_serial);
 }
@@ -358,7 +379,8 @@
     __kmp_fork_call( loc, gtid, fork_context_intel, argc,
-                     VOLATILE_CAST(microtask_t) __kmp_teams_master,
+                     VOLATILE_CAST(void *) microtask,               // "unwrapped" task
+                     VOLATILE_CAST(microtask_t) __kmp_teams_master, // "wrapped" task
                      VOLATILE_CAST(launch_t) __kmp_invoke_teams_master,
 #if (KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64) && KMP_OS_LINUX
                      &ap
@@ -662,6 +684,20 @@
     if( KMP_MASTER_GTID( global_tid ))
         status = 1;

+#if OMPT_SUPPORT && OMPT_TRACE
+    if (status) {
+        kmp_info_t *this_thr = __kmp_threads[ global_tid ];
+        kmp_team_t *team = this_thr -> th.th_team;
+        if ((ompt_status == ompt_status_track_callback) &&
+            ompt_callbacks.ompt_callback(ompt_event_master_begin)) {
+            int tid = __kmp_tid_from_gtid( global_tid );
+            ompt_callbacks.ompt_callback(ompt_event_master_begin)(
+                team->t.ompt_team_info.parallel_id,
+                team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_id);
+        }
+    }
+#endif
+
     if ( __kmp_env_consistency_check ) {
 #if KMP_USE_DYNAMIC_LOCK
         if (status)
@@ -694,6 +730,18 @@
     KMP_DEBUG_ASSERT( KMP_MASTER_GTID( global_tid ));

+#if OMPT_SUPPORT && OMPT_TRACE
+    kmp_info_t *this_thr = __kmp_threads[ global_tid ];
+    kmp_team_t *team = this_thr -> th.th_team;
+    if ((ompt_status == ompt_status_track_callback) &&
+        ompt_callbacks.ompt_callback(ompt_event_master_end)) {
+        int tid = __kmp_tid_from_gtid( global_tid );
+        ompt_callbacks.ompt_callback(ompt_event_master_end)(
+            team->t.ompt_team_info.parallel_id,
+            team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_id);
+    }
+#endif
+
     if ( __kmp_env_consistency_check ) {
         if( global_tid < 0 )
             KMP_WARNING( ThreadIdentInvalid );
@@ -729,11 +777,41 @@
     th = __kmp_threads[ gtid ];

+#if OMPT_SUPPORT && OMPT_TRACE
+    if (ompt_status & ompt_status_track) {
+        /* OMPT state update */
+        th->th.ompt_thread_info.wait_id = (uint64_t) loc;
+        th->th.ompt_thread_info.state = ompt_state_wait_ordered;
+
+        /* OMPT event callback */
+        if ((ompt_status == ompt_status_track_callback) &&
+            ompt_callbacks.ompt_callback(ompt_event_wait_ordered)) {
+            ompt_callbacks.ompt_callback(ompt_event_wait_ordered)(
+                th->th.ompt_thread_info.wait_id);
+        }
+    }
+#endif
+
     if ( th -> th.th_dispatch -> th_deo_fcn != 0 )
         (*th->th.th_dispatch->th_deo_fcn)( & gtid, & cid, loc );
     else
         __kmp_parallel_deo( & gtid, & cid, loc );

+#if OMPT_SUPPORT && OMPT_TRACE
+    if (ompt_status & ompt_status_track) {
+        /* OMPT state update */
+        th->th.ompt_thread_info.state = ompt_state_work_parallel;
+        th->th.ompt_thread_info.wait_id = 0;
+
+        /* OMPT event callback */
+        if ((ompt_status == ompt_status_track_callback) &&
+            ompt_callbacks.ompt_callback(ompt_event_acquired_ordered)) {
+            ompt_callbacks.ompt_callback(ompt_event_acquired_ordered)(
+                th->th.ompt_thread_info.wait_id);
+        }
+    }
+#endif
+
 #if USE_ITT_BUILD
     __kmp_itt_ordered_start( gtid );
 #endif /* USE_ITT_BUILD */
@@ -765,6 +843,14 @@
         (*th->th.th_dispatch->th_dxo_fcn)( & gtid, & cid, loc );
     else
         __kmp_parallel_dxo( & gtid, & cid, loc );
+
+#if OMPT_SUPPORT && OMPT_BLAME
+    if ((ompt_status == ompt_status_track_callback) &&
+        ompt_callbacks.ompt_callback(ompt_event_release_ordered)) {
+        ompt_callbacks.ompt_callback(ompt_event_release_ordered)(
+            th->th.ompt_thread_info.wait_id);
+    }
+#endif
 }

 #if KMP_USE_DYNAMIC_LOCK
@@ -1137,6 +1223,14 @@
     // Value of 'crit' should be good for using as a critical_id of the critical section directive.
     __kmp_release_user_lock_with_checks( lck, global_tid );

+#if OMPT_SUPPORT && OMPT_BLAME
+    if ((ompt_status == ompt_status_track_callback) &&
+        ompt_callbacks.ompt_callback(ompt_event_release_critical)) {
+        ompt_callbacks.ompt_callback(ompt_event_release_critical)(
+            (uint64_t) lck);
+    }
+#endif
+
 #endif // KMP_USE_DYNAMIC_LOCK

     KA_TRACE( 15, ("__kmpc_end_critical: done T#%d\n", global_tid ));
@@ -1257,6 +1351,31 @@
 {
     KMP_COUNT_BLOCK(OMP_SINGLE);
     kmp_int32 rc = __kmp_enter_single( global_tid, loc, TRUE );
+
+#if OMPT_SUPPORT && OMPT_TRACE
+    kmp_info_t *this_thr = __kmp_threads[ global_tid ];
+    kmp_team_t *team = this_thr -> th.th_team;
+    int tid = __kmp_tid_from_gtid( global_tid );
+
+    if (ompt_status == ompt_status_track_callback) {
+        if (rc) {
+            if (ompt_callbacks.ompt_callback(ompt_event_single_in_block_begin)) {
+                ompt_callbacks.ompt_callback(ompt_event_single_in_block_begin)(
+                    team->t.ompt_team_info.parallel_id,
+                    team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_id,
+                    team->t.ompt_team_info.microtask);
+            }
+        } else {
+            if (ompt_callbacks.ompt_callback(ompt_event_single_others_begin)) {
+                ompt_callbacks.ompt_callback(ompt_event_single_others_begin)(
+                    team->t.ompt_team_info.parallel_id,
+                    team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_id);
+            }
+            this_thr->th.ompt_thread_info.state = ompt_state_wait_single;
+        }
+    }
+#endif
+
     return rc;
 }

@@ -1273,6 +1392,19 @@
 __kmpc_end_single(ident_t *loc, kmp_int32 global_tid)
 {
     __kmp_exit_single( global_tid );
+
+#if OMPT_SUPPORT && OMPT_TRACE
+    kmp_info_t *this_thr = __kmp_threads[ global_tid ];
+    kmp_team_t *team = this_thr -> th.th_team;
+    int tid = __kmp_tid_from_gtid( global_tid );
+
+    if ((ompt_status == ompt_status_track_callback) &&
+        ompt_callbacks.ompt_callback(ompt_event_single_in_block_end)) {
+        ompt_callbacks.ompt_callback(ompt_event_single_in_block_end)(
+            team->t.ompt_team_info.parallel_id,
+            team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_id);
+    }
+#endif
 }

 /*!
@@ -1287,6 +1419,19 @@
 {
     KE_TRACE( 10, ("__kmpc_for_static_fini called T#%d\n", global_tid));

+#if OMPT_SUPPORT && OMPT_TRACE
+    kmp_info_t *this_thr = __kmp_threads[ global_tid ];
+    kmp_team_t *team = this_thr -> th.th_team;
+    int tid = __kmp_tid_from_gtid( global_tid );
+
+    if ((ompt_status == ompt_status_track_callback) &&
+        ompt_callbacks.ompt_callback(ompt_event_loop_end)) {
+        ompt_callbacks.ompt_callback(ompt_event_loop_end)(
+            team->t.ompt_team_info.parallel_id,
+            team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_id);
+    }
+#endif
+
     if ( __kmp_env_consistency_check )
         __kmp_pop_workshare( global_tid, ct_pdo, loc );
 }
@@ -1928,6 +2073,13 @@
     RELEASE_LOCK( lck, gtid );

+#if OMPT_SUPPORT && OMPT_BLAME
+    if ((ompt_status == ompt_status_track_callback) &&
+        ompt_callbacks.ompt_callback(ompt_event_release_lock)) {
+        ompt_callbacks.ompt_callback(ompt_event_release_lock)((uint64_t) lck);
+    }
+#endif
+
 #endif // KMP_USE_DYNAMIC_LOCK
 }

@@ -1980,7 +2132,20 @@
     __kmp_itt_lock_releasing( lck );
 #endif /* USE_ITT_BUILD */

-    RELEASE_NESTED_LOCK( lck, gtid );
+    int release_status = RELEASE_NESTED_LOCK( lck, gtid );
+#if OMPT_SUPPORT && OMPT_BLAME
+    if (ompt_status == ompt_status_track_callback) {
+        if (release_status == KMP_LOCK_RELEASED) {
+            if (ompt_callbacks.ompt_callback(ompt_event_release_nest_lock_last)) {
+                ompt_callbacks.ompt_callback(ompt_event_release_nest_lock_last)(
+                    (uint64_t) lck);
+            }
+        } else if (ompt_callbacks.ompt_callback(ompt_event_release_nest_lock_prev)) {
+            ompt_callbacks.ompt_callback(ompt_event_release_nest_lock_prev)(
+                (uint64_t) lck);
+        }
+    }
+#endif

 #endif // KMP_USE_DYNAMIC_LOCK
 }
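The csupport hooks above consistently report a lock by its address cast to a 64-bit wait id, so a tool can key contention statistics directly on that value. A minimal sketch of such bookkeeping (the fixed-size table and hash constant are illustrative choices, not part of the patch):

    #include <stdint.h>

    #define WAIT_TABLE_SIZE 4096u                 /* power of two */

    typedef struct {
        uint64_t wait_id;                         /* lock address from the runtime */
        uint64_t waits;                           /* ompt_event_wait_* count */
    } wait_slot_t;

    static wait_slot_t wait_table[WAIT_TABLE_SIZE];

    /* Count one wait event; colliding locks share a slot, which is
       tolerable for a rough contention profile. */
    static void count_wait(uint64_t wait_id) {
        uint32_t slot = (uint32_t)(wait_id * 0x9E3779B97F4A7C15ull)
                        & (WAIT_TABLE_SIZE - 1);
        wait_table[slot].wait_id = wait_id;
        __sync_fetch_and_add(&wait_table[slot].waits, 1);
    }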
Index: runtime/src/kmp_dispatch.cpp
===================================================================
--- runtime/src/kmp_dispatch.cpp
+++ runtime/src/kmp_dispatch.cpp
@@ -35,6 +35,11 @@
 #include <float.h>
 #endif

+#if OMPT_SUPPORT
+#include "ompt-internal.h"
+#include "ompt-specific.h"
+#endif
+
 /* ------------------------------------------------------------------------ */
 /* ------------------------------------------------------------------------ */

@@ -1189,6 +1194,16 @@
         }
     }
 #endif // ( KMP_STATIC_STEAL_ENABLED && USE_STEALING )
+
+#if OMPT_SUPPORT && OMPT_TRACE
+    if ((ompt_status == ompt_status_track_callback) &&
+        ompt_callbacks.ompt_callback(ompt_event_loop_begin)) {
+        ompt_team_info_t *team_info = __ompt_get_teaminfo(0, NULL);
+        ompt_task_info_t *task_info = __ompt_get_taskinfo(0);
+        ompt_callbacks.ompt_callback(ompt_event_loop_begin)(
+            team_info->parallel_id, task_info->task_id, team_info->microtask);
+    }
+#endif
 }

 /*
@@ -1339,6 +1354,24 @@
 #endif /* KMP_GOMP_COMPAT */

+/* Define a macro for exiting __kmp_dispatch_next(). If status is 0
+ * (no more work), then tell OMPT the loop is over. In some cases
+ * kmp_dispatch_fini() is not called.
+ */
+#if OMPT_SUPPORT && OMPT_TRACE
+#define OMPT_LOOP_END                                                      \
+    if (status == 0) {                                                     \
+        if ((ompt_status == ompt_status_track_callback) &&                 \
+            ompt_callbacks.ompt_callback(ompt_event_loop_end)) {           \
+            ompt_team_info_t *team_info = __ompt_get_teaminfo(0, NULL);    \
+            ompt_task_info_t *task_info = __ompt_get_taskinfo(0);          \
+            ompt_callbacks.ompt_callback(ompt_event_loop_end)(             \
+                team_info->parallel_id, task_info->task_id);               \
+        }                                                                  \
+    }
+#else
+#define OMPT_LOOP_END // no-op
+#endif

 template< typename T >
 static int
 __kmp_dispatch_next(
@@ -1476,6 +1509,7 @@
 #if INCLUDE_SSC_MARKS
         SSC_MARK_DISPATCH_NEXT();
 #endif
+        OMPT_LOOP_END;
         return status;
     } else {
         kmp_int32 last = 0;
@@ -2115,6 +2149,7 @@
 #if INCLUDE_SSC_MARKS
     SSC_MARK_DISPATCH_NEXT();
 #endif
+    OMPT_LOOP_END;
     return status;
 }

Index: runtime/src/kmp_gsupport.c
===================================================================
--- runtime/src/kmp_gsupport.c
+++ runtime/src/kmp_gsupport.c
@@ -19,6 +19,10 @@
 #include "kmp.h"
 #include "kmp_atomic.h"

+#if OMPT_SUPPORT
+#include "ompt-specific.h"
+#endif
+
 #ifdef __cplusplus
     extern "C" {
 #endif // __cplusplus

@@ -106,6 +110,11 @@
 {
     int gtid = __kmp_entry_gtid();
     KA_TRACE(20, ("GOMP_atomic_start: T#%d\n", gtid));
+
+#if OMPT_SUPPORT
+    __ompt_thread_assign_wait_id(0);
+#endif
+
     __kmp_acquire_atomic_lock(&__kmp_atomic_lock, gtid);
 }

@@ -246,7 +255,36 @@
 __kmp_GOMP_microtask_wrapper(int *gtid, int *npr, void (*task)(void *),
     void *data)
 {
+#if OMPT_SUPPORT
+    kmp_info_t *thr;
+    ompt_frame_t *ompt_frame;
+    ompt_state_t enclosing_state;
+
+    if (ompt_status & ompt_status_track) {
+        // get pointer to thread data structure
+        thr = __kmp_threads[*gtid];
+
+        // save enclosing task state; set current state for task
+        enclosing_state = thr->th.ompt_thread_info.state;
+        thr->th.ompt_thread_info.state = ompt_state_work_parallel;
+
+        // set task frame
+        ompt_frame = __ompt_get_task_frame_internal(0);
+        ompt_frame->exit_runtime_frame = __builtin_frame_address(0);
+    }
+#endif
+
     task(data);
+
+#if OMPT_SUPPORT
+    if (ompt_status & ompt_status_track) {
+        // clear task frame
+        ompt_frame->exit_runtime_frame = NULL;

+        // restore enclosing state
+        thr->th.ompt_thread_info.state = enclosing_state;
+    }
+#endif
 }

@@ -264,10 +302,37 @@
     KMP_DISPATCH_INIT(loc, *gtid, schedule, start, end, incr, chunk_size,
                       schedule != kmp_sch_static);

+#if OMPT_SUPPORT
+    kmp_info_t *thr;
+    ompt_frame_t *ompt_frame;
+    ompt_state_t enclosing_state;
+
+    if (ompt_status & ompt_status_track) {
+        thr = __kmp_threads[*gtid];
+        // save enclosing task state; set current state for task
+        enclosing_state = thr->th.ompt_thread_info.state;
+        thr->th.ompt_thread_info.state = ompt_state_work_parallel;
+
+        // set task frame
+        ompt_frame = __ompt_get_task_frame_internal(0);
+        ompt_frame->exit_runtime_frame = __builtin_frame_address(0);
+    }
+#endif
+
     //
     // Now invoke the microtask.
     //
     task(data);
+
+#if OMPT_SUPPORT
+    if (ompt_status & ompt_status_track) {
+        // clear task frame
+        ompt_frame->exit_runtime_frame = NULL;

+        // reset enclosing state
+        thr->th.ompt_thread_info.state = enclosing_state;
+    }
+#endif
 }

@@ -275,14 +340,22 @@
 static
 #endif /* KMP_DEBUG */
 void
-__kmp_GOMP_fork_call(ident_t *loc, int gtid, microtask_t wrapper, int argc,...)
+__kmp_GOMP_fork_call(ident_t *loc, int gtid, void (*unwrapped_task)(void *), microtask_t wrapper, int argc,...)
 {
     int rc;
+    kmp_info_t *thr = __kmp_threads[gtid];
+    kmp_team_t *team = thr->th.th_team;
+    int tid = __kmp_tid_from_gtid(gtid);
     va_list ap;
     va_start(ap, argc);

-    rc = __kmp_fork_call(loc, gtid, fork_context_gnu, argc, wrapper, __kmp_invoke_task_func,
+#if OMPT_SUPPORT
+    team->t.t_implicit_task_taskdata[tid].
+        ompt_task_info.frame.reenter_runtime_frame = NULL;
+#endif
+
+    rc = __kmp_fork_call(loc, gtid, fork_context_gnu, argc,
+                         VOLATILE_CAST(void *) unwrapped_task, wrapper, __kmp_invoke_task_func,
 #if (KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64) && KMP_OS_LINUX
                          &ap
 #else
@@ -293,10 +366,69 @@
     va_end(ap);

     if (rc) {
+        __kmp_run_before_invoked_task(gtid, tid, thr, team);
+    }
+
+#if OMPT_SUPPORT && OMPT_TRACE
+    if (ompt_status & ompt_status_track) {
+        ompt_team_info_t *team_info = __ompt_get_teaminfo(0, NULL);
+        ompt_task_info_t *task_info = __ompt_get_taskinfo(0);
+
+        // implicit task callback
+        if ((ompt_status == ompt_status_track_callback) &&
+            ompt_callbacks.ompt_callback(ompt_event_implicit_task_begin)) {
+            ompt_callbacks.ompt_callback(ompt_event_implicit_task_begin)(
+                team_info->parallel_id, task_info->task_id);
+        }
+        thr->th.ompt_thread_info.state = ompt_state_work_parallel;
+    }
+#endif
+}
+
+static void
+__kmp_GOMP_serialized_parallel(ident_t *loc, kmp_int32 gtid, void (*task)(void *))
+{
+    __kmp_serialized_parallel(loc, gtid);
+
+#if OMPT_SUPPORT
+    if (ompt_status & ompt_status_track) {
+        ompt_task_id_t ompt_task_id = __ompt_get_task_id_internal(0);
+        ompt_frame_t *ompt_frame = __ompt_get_task_frame_internal(0);
         kmp_info_t *thr = __kmp_threads[gtid];
-        __kmp_run_before_invoked_task(gtid, __kmp_tid_from_gtid(gtid), thr,
-            thr->th.th_team);
+
+        ompt_parallel_id_t ompt_parallel_id = __ompt_parallel_id_new(gtid);
+        ompt_task_id_t my_ompt_task_id = __ompt_task_id_new(gtid);
+
+        ompt_frame->exit_runtime_frame = NULL;
+
+        // parallel region callback
+        if ((ompt_status == ompt_status_track_callback) &&
+            ompt_callbacks.ompt_callback(ompt_event_parallel_begin)) {
+            int team_size = 1;
+            ompt_callbacks.ompt_callback(ompt_event_parallel_begin)(
+                ompt_task_id, ompt_frame, ompt_parallel_id,
+                team_size, (void *) task);
+        }
+
+        // set up lightweight task
+        ompt_lw_taskteam_t *lwt = (ompt_lw_taskteam_t *)
+            __kmp_allocate(sizeof(ompt_lw_taskteam_t));
+        __ompt_lw_taskteam_init(lwt, thr, gtid, (void *) task, ompt_parallel_id);
+        lwt->ompt_task_info.task_id = my_ompt_task_id;
+        lwt->ompt_task_info.frame.exit_runtime_frame = 0;
+        __ompt_lw_taskteam_link(lwt, thr);
+
+#if OMPT_TRACE
+        // implicit task callback
+        if ((ompt_status == ompt_status_track_callback) &&
+            ompt_callbacks.ompt_callback(ompt_event_implicit_task_begin)) {
+            ompt_callbacks.ompt_callback(ompt_event_implicit_task_begin)(
+                ompt_parallel_id, my_ompt_task_id);
+        }
+        thr->th.ompt_thread_info.state = ompt_state_work_parallel;
+#endif
     }
+#endif
 }

@@ -304,6 +436,16 @@
 xexpand(KMP_API_NAME_GOMP_PARALLEL_START)(void (*task)(void *), void *data, unsigned num_threads)
 {
     int gtid = __kmp_entry_gtid();
+
+#if OMPT_SUPPORT
+    ompt_frame_t *parent_frame;
+
+    if (ompt_status & ompt_status_track) {
+        parent_frame = __ompt_get_task_frame_internal(0);
+        parent_frame->reenter_runtime_frame = __builtin_frame_address(0);
+    }
+#endif
+
     MKLOC(loc, "GOMP_parallel_start");
     KA_TRACE(20, ("GOMP_parallel_start: T#%d\n", gtid));

@@ -311,12 +453,18 @@
         if (num_threads != 0) {
             __kmp_push_num_threads(&loc, gtid, num_threads);
         }
-        __kmp_GOMP_fork_call(&loc, gtid,
+        __kmp_GOMP_fork_call(&loc, gtid, task,
             (microtask_t)__kmp_GOMP_microtask_wrapper, 2, task,
             data);
     }
     else {
-        __kmpc_serialized_parallel(&loc, gtid);
+        __kmp_GOMP_serialized_parallel(&loc, gtid, task);
     }
+
+#if OMPT_SUPPORT
+    if (ompt_status & ompt_status_track) {
+        parent_frame->reenter_runtime_frame = NULL;
+    }
+#endif
 }

@@ -324,9 +472,39 @@
 xexpand(KMP_API_NAME_GOMP_PARALLEL_END)(void)
 {
     int gtid = __kmp_get_gtid();
+    kmp_info_t *thr = __kmp_threads[gtid];
+
     MKLOC(loc, "GOMP_parallel_end");
     KA_TRACE(20, ("GOMP_parallel_end: T#%d\n", gtid));

+#if OMPT_SUPPORT
+    ompt_parallel_id_t parallel_id;
+    ompt_frame_t *ompt_frame = NULL;
+
+    if (ompt_status & ompt_status_track) {
+        ompt_team_info_t *team_info = __ompt_get_teaminfo(0, NULL);
+        parallel_id = team_info->parallel_id;
+
+        ompt_frame = __ompt_get_task_frame_internal(0);
+        ompt_frame->exit_runtime_frame = __builtin_frame_address(0);
+
+#if OMPT_TRACE
+        if ((ompt_status == ompt_status_track_callback) &&
+            ompt_callbacks.ompt_callback(ompt_event_implicit_task_end)) {
+            ompt_task_info_t *task_info = __ompt_get_taskinfo(0);
+            ompt_callbacks.ompt_callback(ompt_event_implicit_task_end)(
+                parallel_id, task_info->task_id);
+        }
+#endif
+
+        // unlink if necessary; a no-op if there is no lightweight task
+        ompt_lw_taskteam_t *lwt = __ompt_lw_taskteam_unlink(thr);
+        // GOMP allocates/frees lwt since it can't be kept on the stack
+        if (lwt) __kmp_free(lwt);
+    }
+#endif
+
     if (! __kmp_threads[gtid]->th.th_team->t.t_serialized) {
         kmp_info_t *thr = __kmp_threads[gtid];
         __kmp_run_after_invoked_task(gtid, __kmp_tid_from_gtid(gtid), thr,
@@ -335,6 +513,22 @@
     }
     else {
         __kmpc_end_serialized_parallel(&loc, gtid);
+
+#if OMPT_SUPPORT
+        if (ompt_status & ompt_status_track) {
+            if ((ompt_status == ompt_status_track_callback) &&
+                ompt_callbacks.ompt_callback(ompt_event_parallel_end)) {
+                ompt_task_info_t *task_info = __ompt_get_taskinfo(0);
+                ompt_callbacks.ompt_callback(ompt_event_parallel_end)(
+                    parallel_id, task_info->task_id);
+            }
+
+            thr->th.ompt_thread_info.state =
+                (((thr->th.th_team)->t.t_serialized) ?
+                 ompt_state_work_serial : ompt_state_work_parallel);
+        }
+#endif
     }
 }

@@ -635,13 +829,13 @@
         if (num_threads != 0) {                                           \
             __kmp_push_num_threads(&loc, gtid, num_threads);              \
         }                                                                 \
-        __kmp_GOMP_fork_call(&loc, gtid,                                  \
+        __kmp_GOMP_fork_call(&loc, gtid, task,                            \
             (microtask_t)__kmp_GOMP_parallel_microtask_wrapper, 9,        \
             task, data, num_threads, &loc, (schedule), lb,                \
             (str > 0) ? (ub - 1) : (ub + 1), str, chunk_sz);              \
     }                                                                     \
     else {                                                                \
-        __kmpc_serialized_parallel(&loc, gtid);                           \
+        __kmp_GOMP_serialized_parallel(&loc, gtid, task);                 \
     }                                                                     \
                                                                           \
     KMP_DISPATCH_INIT(&loc, gtid, (schedule), lb,                         \
@@ -707,9 +901,32 @@
         __kmpc_omp_task(&loc, gtid, task);
     }
     else {
+#if OMPT_SUPPORT
+        ompt_thread_info_t oldInfo;
+        kmp_info_t *thread;
+        kmp_taskdata_t *taskdata;
+        if (ompt_status & ompt_status_track) {
+            // Store the threads states and restore them after the task
+            thread = __kmp_threads[ gtid ];
+            taskdata = KMP_TASK_TO_TASKDATA(task);
+            oldInfo = thread->th.ompt_thread_info;
+            thread->th.ompt_thread_info.wait_id = 0;
+            thread->th.ompt_thread_info.state = ompt_state_work_parallel;
+            taskdata->ompt_task_info.frame.exit_runtime_frame =
+                __builtin_frame_address(0);
+        }
+#endif
+
         __kmpc_omp_task_begin_if0(&loc, gtid, task);
         func(data);
         __kmpc_omp_task_complete_if0(&loc, gtid, task);
+
+#if OMPT_SUPPORT
+        if (ompt_status & ompt_status_track) {
+            thread->th.ompt_thread_info = oldInfo;
+            taskdata->ompt_task_info.frame.exit_runtime_frame = 0;
+        }
+#endif
     }

     KA_TRACE(20, ("GOMP_task exit: T#%d\n", gtid));
@@ -801,6 +1018,16 @@
 {
     int gtid = __kmp_entry_gtid();
     int last = FALSE;
+
+#if OMPT_SUPPORT
+    ompt_frame_t *parent_frame;
+
+    if (ompt_status & ompt_status_track) {
+        parent_frame = __ompt_get_task_frame_internal(0);
+        parent_frame->reenter_runtime_frame = __builtin_frame_address(0);
+    }
+#endif
+
     MKLOC(loc, "GOMP_parallel_sections_start");
     KA_TRACE(20, ("GOMP_parallel_sections_start: T#%d\n", gtid));

@@ -808,14 +1035,20 @@
         if (num_threads != 0) {
             __kmp_push_num_threads(&loc, gtid, num_threads);
         }
-        __kmp_GOMP_fork_call(&loc, gtid,
+        __kmp_GOMP_fork_call(&loc, gtid, task,
             (microtask_t)__kmp_GOMP_parallel_microtask_wrapper, 9, task, data,
             num_threads, &loc, kmp_nm_dynamic_chunked, (kmp_int)1,
             (kmp_int)count, (kmp_int)1, (kmp_int)1);
     }
     else {
-        __kmpc_serialized_parallel(&loc, gtid);
+        __kmp_GOMP_serialized_parallel(&loc, gtid, task);
+    }
+
+#if OMPT_SUPPORT
+    if (ompt_status & ompt_status_track) {
+        parent_frame->reenter_runtime_frame = NULL;
     }
+#endif

     KMP_DISPATCH_INIT(&loc, gtid, kmp_nm_dynamic_chunked, 1, count, 1, 1, TRUE);

@@ -865,11 +1098,11 @@
     if(flags != 0) {
         __kmp_push_proc_bind(&loc, gtid, (kmp_proc_bind_t)flags);
     }
-    __kmp_GOMP_fork_call(&loc, gtid,
+    __kmp_GOMP_fork_call(&loc, gtid, task,
         (microtask_t)__kmp_GOMP_microtask_wrapper, 2, task, data);
     }
     else {
-        __kmpc_serialized_parallel(&loc, gtid);
+        __kmp_GOMP_serialized_parallel(&loc, gtid, task);
     }
     task(data);
     xexpand(KMP_API_NAME_GOMP_PARALLEL_END)();
@@ -891,13 +1124,13 @@
     if(flags != 0) {
         __kmp_push_proc_bind(&loc, gtid, (kmp_proc_bind_t)flags);
     }
-    __kmp_GOMP_fork_call(&loc, gtid,
+    __kmp_GOMP_fork_call(&loc, gtid, task,
         (microtask_t)__kmp_GOMP_parallel_microtask_wrapper, 9, task, data,
         num_threads, &loc, kmp_nm_dynamic_chunked, (kmp_int)1,
         (kmp_int)count, (kmp_int)1, (kmp_int)1);
     }
     else {
-        __kmpc_serialized_parallel(&loc, gtid);
+        __kmp_GOMP_serialized_parallel(&loc, gtid, task);
     }

     KMP_DISPATCH_INIT(&loc, gtid, kmp_nm_dynamic_chunked, 1, count, 1, 1, TRUE);

@@ -924,13 +1157,13 @@
         if (flags != 0) {                                                 \
             __kmp_push_proc_bind(&loc, gtid, (kmp_proc_bind_t)flags);     \
         }                                                                 \
-        __kmp_GOMP_fork_call(&loc, gtid,                                  \
+        __kmp_GOMP_fork_call(&loc, gtid, task,                            \
            (microtask_t)__kmp_GOMP_parallel_microtask_wrapper, 9,         \
            task, data, num_threads, &loc, (schedule), lb,                 \
            (str > 0) ? (ub - 1) : (ub + 1), str, chunk_sz);               \
     }                                                                     \
     else {                                                                \
-        __kmpc_serialized_parallel(&loc, gtid);                           \
+        __kmp_GOMP_serialized_parallel(&loc, gtid, task);                 \
     }                                                                     \
                                                                           \
     KMP_DISPATCH_INIT(&loc, gtid, (schedule), lb,                         \

Index: runtime/src/kmp_lock.cpp
===================================================================
--- runtime/src/kmp_lock.cpp
+++ runtime/src/kmp_lock.cpp
@@ -1206,6 +1206,10 @@
     volatile kmp_uint32 *spin_here_p;
     kmp_int32 need_mf = 1;

+#if OMPT_SUPPORT
+    ompt_state_t prev_state = ompt_state_undefined;
+#endif
+
     KA_TRACE( 1000, ("__kmp_acquire_queuing_lock: lck:%p, T#%d entering\n", lck, gtid ));

     KMP_FSYNC_PREPARE( lck );
@@ -1309,6 +1313,16 @@
 #ifdef DEBUG_QUEUING_LOCKS
                 TRACE_LOCK_HT( gtid+1, "acq exit: ", head, 0 );
 #endif
+
+#if OMPT_SUPPORT
+                if ((ompt_status & ompt_status_track) &&
+                    prev_state != ompt_state_undefined) {
+                    /* change the state before clearing wait_id */
+                    this_thr->th.ompt_thread_info.state = prev_state;
+                    this_thr->th.ompt_thread_info.wait_id = 0;
+                }
+#endif
+
                 KMP_FSYNC_ACQUIRED( lck );
                 return; /* lock holder cannot be on queue */
             }
@@ -1317,6 +1331,16 @@
             break;
         }

+#if OMPT_SUPPORT
+        if ((ompt_status & ompt_status_track) &&
+            prev_state == ompt_state_undefined) {
+            /* this thread will spin; set wait_id before entering wait state */
+            prev_state = this_thr->th.ompt_thread_info.state;
+            this_thr->th.ompt_thread_info.wait_id = (uint64_t) lck;
+            this_thr->th.ompt_thread_info.state = ompt_state_wait_lock;
+        }
+#endif
+
         if ( enqueued ) {
             if ( tail > 0 ) {
                 kmp_info_t *tail_thr = __kmp_thread_from_gtid( tail - 1 );
@@ -1346,6 +1370,13 @@
 #ifdef DEBUG_QUEUING_LOCKS
             TRACE_LOCK( gtid+1, "acq exit 2" );
 #endif
+
+#if OMPT_SUPPORT
+            /* change the state before clearing wait_id */
+            this_thr->th.ompt_thread_info.state = prev_state;
+            this_thr->th.ompt_thread_info.wait_id = 0;
+#endif
+
             /* got lock, we were dequeued by the thread that released lock */
             return;
         }
@@ -1491,6 +1522,11 @@
 #ifdef DEBUG_QUEUING_LOCKS
             TRACE_LOCK_HT( gtid+1, "rel exit: ", 0, 0 );
#endif
+
+#if OMPT_SUPPORT
+            /* nothing to do - no other thread is trying to shift blame */
+#endif
+
             return KMP_LOCK_RELEASED;
         }
         dequeued = FALSE;

Index: runtime/src/kmp_runtime.c
===================================================================
--- runtime/src/kmp_runtime.c
+++ runtime/src/kmp_runtime.c
@@ -26,6 +26,10 @@
 #include "kmp_stats.h"
 #include "kmp_wait_release.h"

+#if OMPT_SUPPORT
+#include "ompt-specific.h"
+#endif
+
 /* these are temporary issues to be dealt with */
 #define KMP_USE_PRCTL 0
 #define KMP_USE_POOLED_ALLOC 0
@@ -759,6 +763,16 @@
     /* TODO repleace with general release procedure */
     team->t.t_ordered.dt.t_value = ((tid + 1) % team->t.t_nproc );

+#if OMPT_SUPPORT && OMPT_BLAME
+    if ((ompt_status == ompt_status_track_callback) &&
+        ompt_callbacks.ompt_callback(ompt_event_release_ordered)) {
+        /* accept blame for "ordered" waiting */
+        kmp_info_t *this_thread = __kmp_threads[gtid];
+        ompt_callbacks.ompt_callback(ompt_event_release_ordered)(
+            this_thread->th.ompt_thread_info.wait_id);
+    }
+#endif
+
     KMP_MB(); /* Flush all pending memory write invalidates. */
 }
 #endif /* BUILD_PARALLEL_ORDERED */
@@ -1271,7 +1285,14 @@
     __kmp_acquire_bootstrap_lock( &__kmp_forkjoin_lock );

+#if OMPT_SUPPORT
+    ompt_parallel_id_t ompt_parallel_id = __ompt_parallel_id_new(global_tid);
+#endif
+
     new_team = __kmp_allocate_team(this_thr->th.th_root, 1, 1,
+#if OMPT_SUPPORT
+                                   ompt_parallel_id,
+#endif
 #if OMP_40_ENABLED
                                    proc_bind,
 #endif
@@ -1355,6 +1376,11 @@
         }
         this_thr->th.th_dispatch = serial_team->t.t_dispatch;

+#if OMPT_SUPPORT
+        ompt_parallel_id_t ompt_parallel_id = __ompt_parallel_id_new(global_tid);
+        __ompt_team_assign_id(serial_team, ompt_parallel_id);
+#endif
+
         KMP_MB();
     } else {
@@ -1422,6 +1448,7 @@
     int gtid,
     enum fork_context_e call_context, // Intel, GNU, ...
     kmp_int32 argc,
+    void *unwrapped_task,
     microtask_t microtask,
     launch_t invoker,
 /* TODO: revert workaround for Intel(R) 64 tracker #96 */
@@ -1477,6 +1504,21 @@
     root = master_th->th.th_root;
     master_active = root->r.r_active;
     master_set_numthreads = master_th->th.th_set_nproc;
+
+#if OMPT_SUPPORT
+    ompt_parallel_id_t ompt_parallel_id;
+    ompt_task_id_t ompt_task_id;
+    ompt_frame_t *ompt_frame;
+    ompt_task_id_t my_task_id;
+    ompt_parallel_id_t my_parallel_id;
+
+    if (ompt_status & ompt_status_track) {
+        ompt_parallel_id = __ompt_parallel_id_new(gtid);
+        ompt_task_id = __ompt_get_task_id_internal(0);
+        ompt_frame = __ompt_get_task_frame_internal(0);
+    }
+#endif
+
     // Nested level will be an index in the nested nthreads array
     level = parent_team->t.t_level;
 #if OMP_40_ENABLED
@@ -1493,6 +1535,16 @@
     }
 #endif

+#if OMPT_SUPPORT
+    if ((ompt_status == ompt_status_track_callback) &&
+        ompt_callbacks.ompt_callback(ompt_event_parallel_begin)) {
+        int team_size = master_set_numthreads;
+
+        ompt_callbacks.ompt_callback(ompt_event_parallel_begin)(
+            ompt_task_id, ompt_frame, ompt_parallel_id,
+            team_size, unwrapped_task);
+    }
+#endif

     master_th->th.th_ident = loc;

@@ -1519,11 +1571,77 @@
             KMP_DEBUG_ASSERT( parent_team->t.t_serialized > 1 );
             parent_team->t.t_serialized--; // AC: need this in order enquiry functions
                                            // work correctly, will restore at join time
+
+#if OMPT_SUPPORT
+            void *dummy;
+            void **exit_runtime_p;
+
+            ompt_lw_taskteam_t lw_taskteam;
+
+            if (ompt_status & ompt_status_track) {
+                __ompt_lw_taskteam_init(&lw_taskteam, master_th, gtid,
+                    unwrapped_task, ompt_parallel_id);
+                lw_taskteam.ompt_task_info.task_id = __ompt_task_id_new(gtid);
+                exit_runtime_p = &(lw_taskteam.ompt_task_info.frame.exit_runtime_frame);
+
+                __ompt_lw_taskteam_link(&lw_taskteam, master_th);
+
+#if OMPT_TRACE
+                /* OMPT implicit task begin */
+                my_task_id = lw_taskteam.ompt_task_info.task_id;
+                my_parallel_id = parent_team->t.ompt_team_info.parallel_id;
+                if ((ompt_status == ompt_status_track_callback) &&
+                    ompt_callbacks.ompt_callback(ompt_event_implicit_task_begin)) {
+                    ompt_callbacks.ompt_callback(ompt_event_implicit_task_begin)(
+                        my_parallel_id, my_task_id);
+                }
+#endif

+                /* OMPT state */
+                master_th->th.ompt_thread_info.state = ompt_state_work_parallel;
+            } else {
+                exit_runtime_p = &dummy;
+            }
+#endif
+
             KMP_TIME_BLOCK(OMP_work);
-            __kmp_invoke_microtask( microtask, gtid, 0, argc, parent_team->t.t_argv );
+            __kmp_invoke_microtask( microtask, gtid, 0, argc, parent_team->t.t_argv
+#if OMPT_SUPPORT
+                                    , exit_runtime_p
+#endif
+                                    );
+
+#if OMPT_SUPPORT
+            if (ompt_status & ompt_status_track) {
+#if OMPT_TRACE
+                lw_taskteam.ompt_task_info.frame.exit_runtime_frame = 0;
+
+                if ((ompt_status == ompt_status_track_callback) &&
+                    ompt_callbacks.ompt_callback(ompt_event_implicit_task_end)) {
+                    ompt_callbacks.ompt_callback(ompt_event_implicit_task_end)(
+                        ompt_parallel_id, ompt_task_id);
+                }
+
+                __ompt_lw_taskteam_unlink(master_th);
+                // clear the task id only after unlinking the task
+                lw_taskteam.ompt_task_info.task_id = ompt_task_id_none;
+#endif
+
+                if ((ompt_status == ompt_status_track_callback) &&
+                    ompt_callbacks.ompt_callback(ompt_event_parallel_end)) {
+                    ompt_callbacks.ompt_callback(ompt_event_parallel_end)(
+                        ompt_parallel_id, ompt_task_id);
+                }
+                master_th->th.ompt_thread_info.state = ompt_state_overhead;
+            }
+#endif
             return TRUE;
         }
+
         parent_team->t.t_pkfn = microtask;
+#if OMPT_SUPPORT
+        parent_team->t.ompt_team_info.microtask = unwrapped_task;
+#endif
         parent_team->t.t_invoke = invoker;
         KMP_TEST_THEN_INC32( (kmp_int32*) &root->r.r_in_parallel );
         parent_team->t.t_active_level ++;
@@ -1620,10 +1738,70 @@
             // revert change made in __kmpc_serialized_parallel()
             master_th->th.th_serial_team->t.t_level--;
             // Get args from parent team for teams construct
+
+#if OMPT_SUPPORT
+            void *dummy;
+            void **exit_runtime_p;
+
+            ompt_lw_taskteam_t lw_taskteam;
+
+            if (ompt_status & ompt_status_track) {
+                __ompt_lw_taskteam_init(&lw_taskteam, master_th, gtid,
+                    unwrapped_task, ompt_parallel_id);
+                lw_taskteam.ompt_task_info.task_id = __ompt_task_id_new(gtid);
+                exit_runtime_p = &(lw_taskteam.ompt_task_info.frame.exit_runtime_frame);
+
+                __ompt_lw_taskteam_link(&lw_taskteam, master_th);
+
+#if OMPT_TRACE
+                my_task_id = lw_taskteam.ompt_task_info.task_id;
+                if ((ompt_status == ompt_status_track_callback) &&
+                    ompt_callbacks.ompt_callback(ompt_event_implicit_task_begin)) {
+                    ompt_callbacks.ompt_callback(ompt_event_implicit_task_begin)(
+                        ompt_parallel_id, my_task_id);
+                }
+#endif
+
+                /* OMPT state */
+                master_th->th.ompt_thread_info.state = ompt_state_work_parallel;
+            } else {
+                exit_runtime_p = &dummy;
+            }
+#endif
+
             {
                 KMP_TIME_BLOCK(OMP_work);
-                __kmp_invoke_microtask( microtask, gtid, 0, argc, parent_team->t.t_argv );
+                __kmp_invoke_microtask( microtask, gtid, 0, argc, parent_team->t.t_argv
+#if OMPT_SUPPORT
+                                        , exit_runtime_p
+#endif
+                                        );
             }
+
+#if OMPT_SUPPORT
+            if (ompt_status & ompt_status_track) {
+                lw_taskteam.ompt_task_info.frame.exit_runtime_frame = 0;
+
+#if OMPT_TRACE
+                if ((ompt_status == ompt_status_track_callback) &&
+                    ompt_callbacks.ompt_callback(ompt_event_implicit_task_end)) {
+                    ompt_callbacks.ompt_callback(ompt_event_implicit_task_end)(
+                        ompt_parallel_id, ompt_task_id);
+                }
+#endif
+
+                __ompt_lw_taskteam_unlink(master_th);
+                // clear the task id only after unlinking the task
+                lw_taskteam.ompt_task_info.task_id = ompt_task_id_none;
+
+                if ((ompt_status == ompt_status_track_callback) &&
+                    ompt_callbacks.ompt_callback(ompt_event_parallel_end)) {
+                    ompt_callbacks.ompt_callback(ompt_event_parallel_end)(
+                        ompt_parallel_id, ompt_task_id);
+                }
+                master_th->th.ompt_thread_info.state = ompt_state_overhead;
+            }
+#endif
         } else if ( microtask == (microtask_t)__kmp_teams_master ) {
             KMP_DEBUG_ASSERT( master_th->th.th_team == master_th->th.th_serial_team );
             team = master_th->th.th_team;
@@ -1664,15 +1842,88 @@
                 *argv++ = va_arg( ap, void * );
 #endif
             KMP_MB();
+
+#if OMPT_SUPPORT
+            void *dummy;
+            void **exit_runtime_p;
+
+            ompt_lw_taskteam_t lw_taskteam;
+
+            if (ompt_status & ompt_status_track) {
+                __ompt_lw_taskteam_init(&lw_taskteam, master_th, gtid,
+                    unwrapped_task, ompt_parallel_id);
+                lw_taskteam.ompt_task_info.task_id = __ompt_task_id_new(gtid);
+                exit_runtime_p = &(lw_taskteam.ompt_task_info.frame.exit_runtime_frame);
+
+                __ompt_lw_taskteam_link(&lw_taskteam, master_th);
+
+#if OMPT_TRACE
+                /* OMPT implicit task begin */
+                my_task_id = lw_taskteam.ompt_task_info.task_id;
+                my_parallel_id = ompt_parallel_id;
+                if ((ompt_status == ompt_status_track_callback) &&
+                    ompt_callbacks.ompt_callback(ompt_event_implicit_task_begin)) {
+                    ompt_callbacks.ompt_callback(ompt_event_implicit_task_begin)(
+                        my_parallel_id, my_task_id);
+                }
+#endif
+
+                /* OMPT state */
+                master_th->th.ompt_thread_info.state = ompt_state_work_parallel;
+            } else {
+                exit_runtime_p = &dummy;
+            }
+#endif
+
             {
                 KMP_TIME_BLOCK(OMP_work);
-                __kmp_invoke_microtask( microtask, gtid, 0, argc, args );
+                __kmp_invoke_microtask( microtask, gtid, 0, argc, args
+#if OMPT_SUPPORT
+                                        , exit_runtime_p
+#endif
+                                        );
             }
+
+#if OMPT_SUPPORT
+            if (ompt_status & ompt_status_track) {
+#if OMPT_TRACE
+                lw_taskteam.ompt_task_info.frame.exit_runtime_frame = 0;
+
+                if ((ompt_status == ompt_status_track_callback) &&
+                    ompt_callbacks.ompt_callback(ompt_event_implicit_task_end)) {
+                    ompt_callbacks.ompt_callback(ompt_event_implicit_task_end)(
+                        my_parallel_id, my_task_id);
+                }
+#endif
+
+                __ompt_lw_taskteam_unlink(master_th);
+                // clear the task id only after unlinking the task
+                lw_taskteam.ompt_task_info.task_id = ompt_task_id_none;
+
+                if ((ompt_status == ompt_status_track_callback) &&
+                    ompt_callbacks.ompt_callback(ompt_event_parallel_end)) {
+                    ompt_callbacks.ompt_callback(ompt_event_parallel_end)(
+                        ompt_parallel_id, ompt_task_id);
+                }
+                master_th->th.ompt_thread_info.state = ompt_state_overhead;
+            }
+#endif
 #if OMP_40_ENABLED
             }
 #endif /* OMP_40_ENABLED */
         }
         else if ( call_context == fork_context_gnu ) {
+#if OMPT_SUPPORT
+            ompt_lw_taskteam_t *lwt = (ompt_lw_taskteam_t *)
+                __kmp_allocate(sizeof(ompt_lw_taskteam_t));
+            __ompt_lw_taskteam_init(lwt, master_th, gtid,
+                unwrapped_task, ompt_parallel_id);
+
+            lwt->ompt_task_info.task_id = __ompt_task_id_new(gtid);
+            lwt->ompt_task_info.frame.exit_runtime_frame = 0;
+            __ompt_lw_taskteam_link(lwt, master_th);
+#endif
+
             // we were called from GNU native code
             KA_TRACE( 20, ("__kmp_fork_call: T#%d serial exit\n", gtid ));
             return FALSE;
@@ -1759,6 +2010,9 @@
         /* allocate a new parallel team */
         KF_TRACE( 10, ( "__kmp_fork_call: before __kmp_allocate_team\n" ) );
         team = __kmp_allocate_team(root, nthreads, nthreads,
+#if OMPT_SUPPORT
+                                   ompt_parallel_id,
+#endif
 #if OMP_40_ENABLED
                                    proc_bind,
 #endif
@@ -1767,6 +2021,9 @@
         /* allocate a new parallel team */
         KF_TRACE( 10, ( "__kmp_fork_call: before __kmp_allocate_team\n" ) );
         team = __kmp_allocate_team(root, nthreads, nthreads,
+#if OMPT_SUPPORT
+                                   ompt_parallel_id,
+#endif
 #if OMP_40_ENABLED
                                    proc_bind,
 #endif
@@ -1781,6 +2038,9 @@
     team->t.t_ident = loc;
     team->t.t_parent = parent_team;
     TCW_SYNC_PTR(team->t.t_pkfn, microtask);
+#if OMPT_SUPPORT
+    TCW_SYNC_PTR(team->t.ompt_team_info.microtask, unwrapped_task);
+#endif
     team->t.t_invoke = invoker; /* TODO move this to root, maybe */
     // TODO: parent_team->t.t_level == INT_MAX ???
 #if OMP_40_ENABLED
@@ -1867,6 +2127,9 @@
     __kmp_fork_team_threads( root, team, master_th, gtid );
     __kmp_setup_icv_copy( team, nthreads, &master_th->th.th_current_task->td_icvs, loc );

+#if OMPT_SUPPORT
+    master_th->th.ompt_thread_info.state = ompt_state_work_parallel;
+#endif
+
     __kmp_release_bootstrap_lock( &__kmp_forkjoin_lock );
@@ -1948,9 +2211,42 @@
     KA_TRACE( 20, ("__kmp_fork_call: parallel exit T#%d\n", gtid ));

+#if OMPT_SUPPORT
+    if (ompt_status & ompt_status_track) {
+        master_th->th.ompt_thread_info.state = ompt_state_overhead;
+    }
+#endif
+
     return TRUE;
 }

+#if OMPT_SUPPORT
+static inline void
+__kmp_join_restore_state(
+    kmp_info_t *thread,
+    kmp_team_t *team)
+{
+    // restore state outside the region
+    thread->th.ompt_thread_info.state = ((team->t.t_serialized) ?
+        ompt_state_work_serial : ompt_state_work_parallel);
+}
+
+static inline void
+__kmp_join_ompt(
+    kmp_info_t *thread,
+    kmp_team_t *team,
+    ompt_parallel_id_t parallel_id)
+{
+    if (ompt_callbacks.ompt_callback(ompt_event_parallel_end)) {
+        ompt_task_info_t *task_info = __ompt_get_taskinfo(0);
+        ompt_callbacks.ompt_callback(ompt_event_parallel_end)(
+            parallel_id, task_info->task_id);
+    }
+
+    __kmp_join_restore_state(thread, team);
+}
+#endif
+
 void
 __kmp_join_call(ident_t *loc, int gtid
 #if OMP_40_ENABLED
@@ -1976,6 +2272,12 @@
     master_th->th.th_ident = loc;

+#if OMPT_SUPPORT
+    if (ompt_status & ompt_status_track) {
+        master_th->th.ompt_thread_info.state = ompt_state_overhead;
+    }
+#endif
+
 #if KMP_DEBUG
     if ( __kmp_tasking_mode != tskm_immediate_exec ) {
         KA_TRACE( 20, ( "__kmp_join_call: T#%d, old team = %p old task_team = %p, th_task_team = %p\n",
@@ -2003,6 +2305,13 @@
         }
 #endif /* OMP_40_ENABLED */
         __kmpc_end_serialized_parallel( loc, gtid );
+
+#if OMPT_SUPPORT
+        if (ompt_status == ompt_status_track_callback) {
+            __kmp_join_restore_state(master_th, parent_team);
+        }
+#endif
+
         return;
     }

@@ -2022,6 +2331,10 @@
     KMP_MB();

+#if OMPT_SUPPORT
+    ompt_parallel_id_t parallel_id = team->t.ompt_team_info.parallel_id;
+#endif
+
 #if USE_ITT_BUILD
     if ( __itt_stack_caller_create_ptr ) {
         __kmp_itt_stack_caller_destroy( (__itt_caller)team->t.t_stack_id ); // destroy the stack stitching id after join barrier
@@ -2097,6 +2410,13 @@
             }
         }
     }
+
+#if OMPT_SUPPORT
+    if (ompt_status == ompt_status_track_callback) {
+        __kmp_join_ompt(master_th, parent_team, parallel_id);
+    }
+#endif
+
     return;
     }
 #endif /* OMP_40_ENABLED */
@@ -2182,6 +2502,12 @@
     __kmp_release_bootstrap_lock( &__kmp_forkjoin_lock );

+#if OMPT_SUPPORT
+    if (ompt_status == ompt_status_track_callback) {
+        __kmp_join_ompt(master_th, parent_team, parallel_id);
+    }
+#endif
+
     KMP_MB();
     KA_TRACE( 20, ("__kmp_join_call: exit T#%d\n", gtid ));
 }
@@ -2814,11 +3140,15 @@
     /* setup the root team for this task */
     /* allocate the root team structure */
     KF_TRACE( 10, ( "__kmp_initialize_root: before root_team\n" ) );
+
     root_team =
         __kmp_allocate_team( root,
                              1,                       // new_nproc
                              1,                       // max_nproc
+#if OMPT_SUPPORT
+                             0, // root parallel id
+#endif
 #if OMP_40_ENABLED
                              __kmp_nested_proc_bind.bind_types[0],
 #endif
@@ -2845,11 +3175,15 @@
     /* setup the hot team for this task */
     /* allocate the hot team structure */
     KF_TRACE( 10, ( "__kmp_initialize_root: before hot_team\n" ) );
+
     hot_team =
         __kmp_allocate_team( root,
                              1,                        // new_nproc
                              __kmp_dflt_team_nth_ub * 2, // max_nproc
+#if OMPT_SUPPORT
+                             0, // root parallel id
+#endif
 #if OMP_40_ENABLED
                              __kmp_nested_proc_bind.bind_types[0],
 #endif
@@ -3425,7 +3759,11 @@
     if( ! root_thread->th.th_serial_team ) {
         kmp_internal_control_t r_icvs = __kmp_get_global_icvs();
         KF_TRACE( 10, ( "__kmp_register_root: before serial_team\n" ) );
+
         root_thread->th.th_serial_team = __kmp_allocate_team( root, 1, 1,
+#if OMPT_SUPPORT
+            0, // root parallel id
+#endif
 #if OMP_40_ENABLED
             proc_bind_default,
 #endif
@@ -3563,6 +3901,14 @@
     __kmp_free_handle( root->r.r_uber_thread->th.th_info.ds.ds_thread );
 #endif /* KMP_OS_WINDOWS */

+#if OMPT_SUPPORT
+    if ((ompt_status == ompt_status_track_callback) &&
+        ompt_callbacks.ompt_callback(ompt_event_thread_end)) {
+        int gtid = __kmp_get_gtid();
+        __ompt_thread_end(ompt_thread_initial, gtid);
+    }
+#endif
+
     TCW_4(__kmp_nth, __kmp_nth - 1); // __kmp_reap_thread will decrement __kmp_all_nth.
     __kmp_reap_thread( root->r.r_uber_thread, 1 );
@@ -3894,8 +4240,12 @@
     {
         kmp_internal_control_t r_icvs = __kmp_get_x_global_icvs( team );
         KF_TRACE( 10, ( "__kmp_allocate_thread: before th_serial/serial_team\n" ) );
+
         new_thr->th.th_serial_team = serial_team =
             (kmp_team_t*) __kmp_allocate_team( root, 1, 1,
+#if OMPT_SUPPORT
+                0, // root parallel id
+#endif
 #if OMP_40_ENABLED
                 proc_bind_default,
 #endif
@@ -4395,6 +4745,9 @@
 /* allocate a new team data structure to use.  take one off of the free pool if available */
 kmp_team_t *
 __kmp_allocate_team( kmp_root_t *root, int new_nproc, int max_nproc,
+#if OMPT_SUPPORT
+                     ompt_parallel_id_t ompt_parallel_id,
+#endif
 #if OMP_40_ENABLED
                      kmp_proc_bind_t new_proc_bind,
 #endif
@@ -4764,6 +5117,10 @@
         }
 #endif

+#if OMPT_SUPPORT
+        __ompt_team_assign_id(team, ompt_parallel_id);
+#endif
+
         KMP_MB();

         return team;
@@ -4804,6 +5161,11 @@
 #endif

             KA_TRACE( 20, ("__kmp_allocate_team: using team from pool %d.\n", team->t.t_id ));
+
+#if OMPT_SUPPORT
+            __ompt_team_assign_id(team, ompt_parallel_id);
+#endif
+
             KMP_MB();

             return team;
@@ -4856,6 +5218,11 @@
     team->t.t_proc_bind = new_proc_bind;
 #endif

+#if OMPT_SUPPORT
+    __ompt_team_assign_id(team, ompt_parallel_id);
+    team->t.ompt_serialized_team_info = NULL;
+#endif
+
     KMP_MB();

     KA_TRACE( 20, ("__kmp_allocate_team: done creating a new team %d.\n", team->t.t_id ));
@@ -5101,6 +5468,18 @@
         this_thr->th.th_cons = __kmp_allocate_cons_stack( gtid );  // ATT: Memory leak?
     }

+#if OMPT_SUPPORT
+    if (ompt_status & ompt_status_track) {
+        this_thr->th.ompt_thread_info.state = ompt_state_overhead;
+        this_thr->th.ompt_thread_info.wait_id = 0;
+        this_thr->th.ompt_thread_info.idle_frame = __builtin_frame_address(0);
+        if ((ompt_status == ompt_status_track_callback) &&
+            ompt_callbacks.ompt_callback(ompt_event_thread_begin)) {
+            __ompt_thread_begin(ompt_thread_worker, gtid);
+        }
+    }
+#endif
+
     /* This is the place where threads wait for work */
     while( ! TCR_4(__kmp_global.g.g_done) ) {
         KMP_DEBUG_ASSERT( this_thr == __kmp_threads[ gtid ] );
@@ -5109,9 +5488,21 @@
         /* wait for work to do */
         KA_TRACE( 20, ("__kmp_launch_thread: T#%d waiting for work\n", gtid ));

+#if OMPT_SUPPORT
+        if (ompt_status & ompt_status_track) {
+            this_thr->th.ompt_thread_info.state = ompt_state_idle;
+        }
+#endif
+
         /* No tid yet since not part of a team */
         __kmp_fork_barrier( gtid, KMP_GTID_DNE );

+#if OMPT_SUPPORT
+        if (ompt_status & ompt_status_track) {
+            this_thr->th.ompt_thread_info.state = ompt_state_overhead;
+        }
+#endif
+
         pteam = (kmp_team_t *(*))(& this_thr->th.th_team);

         /* have we been allocated? */
@@ -5124,6 +5515,12 @@
             updateHWFPControl (*pteam);

+#if OMPT_SUPPORT
+            if (ompt_status & ompt_status_track) {
+                this_thr->th.ompt_thread_info.state = ompt_state_work_parallel;
+            }
+#endif
+
             KMP_STOP_EXPLICIT_TIMER(USER_launch_thread_loop);
             {
                 KMP_TIME_BLOCK(USER_worker_invoke);
@@ -5132,6 +5529,15 @@
             KMP_START_EXPLICIT_TIMER(USER_launch_thread_loop);
             KMP_ASSERT( rc );

+#if OMPT_SUPPORT
+            if (ompt_status & ompt_status_track) {
+                /* no frame set while outside task */
+                int tid = __kmp_tid_from_gtid(gtid);
+                (*pteam)->t.t_implicit_task_taskdata[tid].ompt_task_info.frame.exit_runtime_frame = 0;
+
+                this_thr->th.ompt_thread_info.state = ompt_state_overhead;
+            }
+#endif
             KMP_MB();
             KA_TRACE(20, ("__kmp_launch_thread: T#%d(%d:%d) done microtask = %p\n",
                           gtid, (*pteam)->t.t_id, __kmp_tid_from_gtid(gtid), (*pteam)->t.t_pkfn));
@@ -5142,6 +5548,13 @@
     }
     TCR_SYNC_PTR((intptr_t)__kmp_global.g.g_done);

+#if OMPT_SUPPORT
+    if ((ompt_status == ompt_status_track_callback) &&
+        ompt_callbacks.ompt_callback(ompt_event_thread_end)) {
+        __ompt_thread_end(ompt_thread_worker, gtid);
+    }
+#endif
+
     if ( TCR_PTR( this_thr->th.th_task_team ) != NULL ) {
         __kmp_unref_task_team( this_thr->th.th_task_team, this_thr );
     }
@@ -5480,6 +5893,9 @@
     __kmp_cleanup();

+#if OMPT_SUPPORT
+    ompt_fini();
+#endif
 }

 void
@@ -6140,6 +6556,9 @@
     KMP_MB();
     KA_TRACE( 10, ("__kmp_do_serial_initialize: exit\n" ) );

+#if OMPT_SUPPORT
+    ompt_init();
+#endif
 }

 void
@@ -6284,6 +6703,9 @@
     }
     __kmp_do_middle_initialize();
     __kmp_release_bootstrap_lock( &__kmp_initz_lock );

+#if OMPT_SUPPORT
+    ompt_init();
+#endif
 }

 void
@@ -6353,6 +6775,9 @@
     KA_TRACE( 10, ("__kmp_parallel_initialize: exit\n" ) );
     __kmp_release_bootstrap_lock( &__kmp_initz_lock );

+#if OMPT_SUPPORT
+    ompt_init();
+#endif
 }

@@ -6409,8 +6834,49 @@
 #if INCLUDE_SSC_MARKS
     SSC_MARK_INVOKING();
 #endif
+
+#if OMPT_SUPPORT
+    void *dummy;
+    void **exit_runtime_p;
+    ompt_task_id_t my_task_id;
+    ompt_parallel_id_t my_parallel_id;
+
+    if (ompt_status & ompt_status_track) {
+        exit_runtime_p = &(team->t.t_implicit_task_taskdata[tid].
+            ompt_task_info.frame.exit_runtime_frame);
+    } else {
+        exit_runtime_p = &dummy;
+    }
+
+#if OMPT_TRACE
+    my_task_id = team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_id;
+    my_parallel_id = team->t.ompt_team_info.parallel_id;
+    if ((ompt_status == ompt_status_track_callback) &&
+        ompt_callbacks.ompt_callback(ompt_event_implicit_task_begin)) {
+        ompt_callbacks.ompt_callback(ompt_event_implicit_task_begin)(
+            my_parallel_id, my_task_id);
+    }
+#endif
+#endif
+
     rc = __kmp_invoke_microtask( (microtask_t) TCR_SYNC_PTR(team->t.t_pkfn),
-                                 gtid, tid, (int) team->t.t_argc, (void **) team->t.t_argv );
+                                 gtid, tid, (int) team->t.t_argc, (void **) team->t.t_argv
+#if OMPT_SUPPORT
+                                 , exit_runtime_p
+#endif
+                                 );
+
+#if OMPT_SUPPORT && OMPT_TRACE
+    if (ompt_status & ompt_status_track) {
+        if ((ompt_status == ompt_status_track_callback) &&
+            ompt_callbacks.ompt_callback(ompt_event_implicit_task_end)) {
+            ompt_callbacks.ompt_callback(ompt_event_implicit_task_end)(
+                my_parallel_id, my_task_id);
+        }
+        // the implicit task is not dead yet, so we can't clear its task id here
+        team->t.t_implicit_task_taskdata[tid].ompt_task_info.frame.exit_runtime_frame = 0;
+    }
+#endif

 #if USE_ITT_BUILD
     if ( __itt_stack_caller_create_ptr ) {
@@ -6442,7 +6908,8 @@
 #endif
     __kmp_fork_call( loc, gtid, fork_context_intel,
             team->t.t_argc,
-            (microtask_t)thr->th.th_teams_microtask,
+            (void *)thr->th.th_teams_microtask,      // "unwrapped" task
+            (microtask_t)thr->th.th_teams_microtask, // "wrapped" task
             VOLATILE_CAST(launch_t) __kmp_invoke_task_func,
             NULL );
 #if INCLUDE_SSC_MARKS

Index: runtime/src/kmp_sched.cpp
===================================================================
--- runtime/src/kmp_sched.cpp
+++ runtime/src/kmp_sched.cpp
@@ -29,6 +29,10 @@
 #include "kmp_stats.h"
 #include "kmp_itt.h"

+#if OMPT_SUPPORT
+#include "ompt-specific.h"
+#endif
+
 // template for type limits
 template< typename T >
 struct i_maxmin {
@@ -89,6 +93,11 @@
     register UT trip_count;
     register kmp_team_t *team;

+#if OMPT_SUPPORT && OMPT_TRACE
+    ompt_team_info_t *team_info = __ompt_get_teaminfo(0, NULL);
+    ompt_task_info_t *task_info = __ompt_get_taskinfo(0);
+#endif
+
     KMP_DEBUG_ASSERT( plastiter && plower && pupper && pstride );
     KE_TRACE( 10, ("__kmpc_for_static_init called (%d)\n", global_tid));
 #ifdef KMP_DEBUG
@@ -132,6 +141,15 @@
         }
 #endif
         KE_TRACE( 10, ("__kmpc_for_static_init: T#%d return\n", global_tid ) );

+#if OMPT_SUPPORT && OMPT_TRACE
+        if ((ompt_status == ompt_status_track_callback) &&
+            ompt_callbacks.ompt_callback(ompt_event_loop_begin)) {
+            ompt_callbacks.ompt_callback(ompt_event_loop_begin)(
+                team_info->parallel_id, task_info->task_id,
+                team_info->microtask);
+        }
+#endif
         return;
     }

@@ -168,6 +186,15 @@
         }
 #endif
         KE_TRACE( 10, ("__kmpc_for_static_init: T#%d return\n", global_tid ) );

+#if OMPT_SUPPORT && OMPT_TRACE
+        if ((ompt_status == ompt_status_track_callback) &&
+            ompt_callbacks.ompt_callback(ompt_event_loop_begin)) {
+            ompt_callbacks.ompt_callback(ompt_event_loop_begin)(
+                team_info->parallel_id, task_info->task_id,
+                team_info->microtask);
+        }
+#endif
         return;
     }
     nth = team->t.t_nproc;
@@ -187,6 +214,15 @@
         }
 #endif
         KE_TRACE( 10, ("__kmpc_for_static_init: T#%d return\n", global_tid ) );

+#if OMPT_SUPPORT && OMPT_TRACE
+        if ((ompt_status == ompt_status_track_callback) &&
+            ompt_callbacks.ompt_callback(ompt_event_loop_begin)) {
+            ompt_callbacks.ompt_callback(ompt_event_loop_begin)(
+                team_info->parallel_id, task_info->task_id,
+                team_info->microtask);
+        }
+#endif
         return;
     }

@@ -304,6 +340,15 @@
     }
 #endif
     KE_TRACE( 10, ("__kmpc_for_static_init: T#%d return\n", global_tid ) );
("__kmpc_for_static_init: T#%d return\n", global_tid ) ); + +#if OMPT_SUPPORT && OMPT_TRACE + if ((ompt_status == ompt_status_track_callback) && + ompt_callbacks.ompt_callback(ompt_event_loop_begin)) { + ompt_callbacks.ompt_callback(ompt_event_loop_begin)( + team_info->parallel_id, task_info->task_id, team_info->microtask); + } +#endif + return; } Index: runtime/src/kmp_tasking.c =================================================================== --- runtime/src/kmp_tasking.c +++ runtime/src/kmp_tasking.c @@ -434,6 +434,18 @@ KA_TRACE(10, ("__kmp_task_start(exit): T#%d task=%p\n", gtid, taskdata ) ); +#if OMPT_SUPPORT + if ((ompt_status == ompt_status_track_callback) && + ompt_callbacks.ompt_callback(ompt_event_task_begin)) { + kmp_taskdata_t *parent = taskdata->td_parent; + ompt_callbacks.ompt_callback(ompt_event_task_begin)( + parent ? parent->ompt_task_info.task_id : ompt_task_id_none, + parent ? &(parent->ompt_task_info.frame) : NULL, + taskdata->ompt_task_info.task_id, + taskdata->ompt_task_info.function); + } +#endif + return; } @@ -579,6 +591,15 @@ kmp_info_t * thread = __kmp_threads[ gtid ]; kmp_int32 children = 0; +#if OMPT_SUPPORT + if ((ompt_status == ompt_status_track_callback) && + ompt_callbacks.ompt_callback(ompt_event_task_end)) { + kmp_taskdata_t *parent = taskdata->td_parent; + ompt_callbacks.ompt_callback(ompt_event_task_end)( + taskdata->ompt_task_info.task_id); + } +#endif + KA_TRACE(10, ("__kmp_task_finish(enter): T#%d finishing task %p and resuming task %p\n", gtid, taskdata, resumed_task) ); @@ -654,6 +675,9 @@ // Free this task and then ancestor tasks if they have no children. __kmp_free_task_and_ancestors(gtid, taskdata, thread); + // FIXME johnmc: I this statement should be before the last one so if an + // asynchronous inquiry peers into the runtime system it doesn't see the freed + // task as the current task __kmp_threads[ gtid ] -> th.th_current_task = resumed_task; // restore current_task // TODO: GEH - make sure root team implicit task is initialized properly. 
@@ -783,6 +807,10 @@
         KMP_DEBUG_ASSERT(task->td_allocated_child_tasks == 0);
     }

+#if OMPT_SUPPORT
+    __kmp_task_init_ompt(task, tid);
+#endif
+
     KF_TRACE(10, ("__kmp_init_implicit_task(exit): T#:%d team=%p task=%p\n",
                   tid, team, task ) );
 }

@@ -937,6 +965,15 @@
     KA_TRACE(20, ("__kmp_task_alloc(exit): T#%d created task %p parent=%p\n",
                   gtid, taskdata, taskdata->td_parent) );

+#if OMPT_SUPPORT
+    if (ompt_status & ompt_status_track) {
+        taskdata->ompt_task_info.task_id = __ompt_task_id_new(gtid);
+        taskdata->ompt_task_info.function = (void*) task_entry;
+        taskdata->ompt_task_info.frame = (ompt_frame_t)
+            { .exit_runtime_frame = NULL, .reenter_runtime_frame = NULL };
+    }
+#endif
+
     return task;
 }

@@ -984,6 +1021,19 @@
     __kmp_task_start( gtid, task, current_task );

+#if OMPT_SUPPORT
+    ompt_thread_info_t oldInfo;
+    kmp_info_t * thread;
+    if (ompt_status & ompt_status_track) {
+        // Store the threads states and restore them after the task
+        thread = __kmp_threads[ gtid ];
+        oldInfo = thread->th.ompt_thread_info;
+        thread->th.ompt_thread_info.wait_id = 0;
+        thread->th.ompt_thread_info.state = ompt_state_work_parallel;
+        taskdata->ompt_task_info.frame.exit_runtime_frame = __builtin_frame_address(0);
+    }
+#endif
+
 #if OMP_40_ENABLED
     // TODO: cancel tasks if the parallel region has also been cancelled
     // TODO: check if this sequence can be hoisted above __kmp_task_start
@@ -1017,6 +1067,14 @@
     }
 #endif // OMP_40_ENABLED

+#if OMPT_SUPPORT
+    if (ompt_status & ompt_status_track) {
+        thread->th.ompt_thread_info = oldInfo;
+        taskdata->ompt_task_info.frame.exit_runtime_frame = 0;
+    }
+#endif
+
     __kmp_task_finish( gtid, task, current_task );

     KA_TRACE(30, ("__kmp_inovke_task(exit): T#%d completed task %p, resuming task %p\n",
@@ -1073,6 +1131,13 @@
 {
     kmp_taskdata_t * new_taskdata = KMP_TASK_TO_TASKDATA(new_task);

+#if OMPT_SUPPORT
+    if (ompt_status & ompt_status_track) {
+        new_taskdata->ompt_task_info.frame.reenter_runtime_frame =
+            __builtin_frame_address(0);
+    }
+#endif
+
     /* Should we execute the new task or queue it?  For now, let's just always try to
        queue it.  If the queue fills up, then we'll execute it.  */
@@ -1084,6 +1149,11 @@
         __kmp_invoke_task( gtid, new_task, current_task );
     }

+#if OMPT_SUPPORT
+    if (ompt_status & ompt_status_track) {
+        new_taskdata->ompt_task_info.frame.reenter_runtime_frame = 0;
+    }
+#endif
     return TASK_CURRENT_NOT_QUEUED;
 }
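The frame fields maintained above give an asynchronous profiler two markers per task: reenter_runtime_frame is published just before user code enters the runtime, and exit_runtime_frame just before the runtime enters user code. A sketch of how a sampler might classify a sampled frame pointer against them (illustrative only; assumes a downward-growing stack):

    #include <stdbool.h>
    #include <stdint.h>
    #include "ompt.h"

    /* Attribute a sampled frame pointer to user code or runtime overhead. */
    static bool sample_in_task_body(const ompt_frame_t *f, void *sampled_fp) {
        uintptr_t fp = (uintptr_t) sampled_fp;
        if (f->exit_runtime_frame != NULL &&
            fp < (uintptr_t) f->exit_runtime_frame)
            return true;    /* below the exit frame: inside the task body */
        return false;       /* otherwise charge the sample to the runtime */
    }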
Index: runtime/src/kmp_wait_release.h
===================================================================
--- runtime/src/kmp_wait_release.h
+++ runtime/src/kmp_wait_release.h
@@ -95,6 +95,32 @@
     th_gtid = this_thr->th.th_info.ds.ds_gtid;
     KA_TRACE(20, ("__kmp_wait_sleep: T#%d waiting for flag(%p)\n", th_gtid, flag));

+#if OMPT_SUPPORT && OMPT_BLAME
+    if (ompt_status == ompt_status_track_callback) {
+        if (this_thr->th.ompt_thread_info.state == ompt_state_idle){
+            if (ompt_callbacks.ompt_callback(ompt_event_idle_begin)) {
+                ompt_callbacks.ompt_callback(ompt_event_idle_begin)(th_gtid + 1);
+            }
+        } else if (ompt_callbacks.ompt_callback(ompt_event_wait_barrier_begin)) {
+            KMP_DEBUG_ASSERT(this_thr->th.ompt_thread_info.state == ompt_state_wait_barrier ||
+                             this_thr->th.ompt_thread_info.state == ompt_state_wait_barrier_implicit ||
+                             this_thr->th.ompt_thread_info.state == ompt_state_wait_barrier_explicit);

+            ompt_lw_taskteam_t* team = this_thr->th.th_team->t.ompt_serialized_team_info;
+            ompt_parallel_id_t pId;
+            ompt_task_id_t tId;
+            if (team){
+                pId = team->ompt_team_info.parallel_id;
+                tId = team->ompt_task_info.task_id;
+            } else {
+                pId = this_thr->th.th_team->t.ompt_team_info.parallel_id;
+                tId = this_thr->th.th_current_task->ompt_task_info.task_id;
+            }
+            ompt_callbacks.ompt_callback(ompt_event_wait_barrier_begin)(pId, tId);
+        }
+    }
+#endif
+
     // Setup for waiting
     KMP_INIT_YIELD(spins);

@@ -207,6 +233,33 @@
         }
         // TODO: If thread is done with work and times out, disband/free
     }

+#if OMPT_SUPPORT && OMPT_BLAME
+    if (ompt_status == ompt_status_track_callback) {
+        if (this_thr->th.ompt_thread_info.state == ompt_state_idle){
+            if (ompt_callbacks.ompt_callback(ompt_event_idle_end)) {
+                ompt_callbacks.ompt_callback(ompt_event_idle_end)(th_gtid + 1);
+            }
+        } else if (ompt_callbacks.ompt_callback(ompt_event_wait_barrier_end)) {
+            KMP_DEBUG_ASSERT(this_thr->th.ompt_thread_info.state == ompt_state_wait_barrier ||
+                             this_thr->th.ompt_thread_info.state == ompt_state_wait_barrier_implicit ||
+                             this_thr->th.ompt_thread_info.state == ompt_state_wait_barrier_explicit);

+            ompt_lw_taskteam_t* team = this_thr->th.th_team->t.ompt_serialized_team_info;
+            ompt_parallel_id_t pId;
+            ompt_task_id_t tId;
+            if (team){
+                pId = team->ompt_team_info.parallel_id;
+                tId = team->ompt_task_info.task_id;
+            } else {
+                pId = this_thr->th.th_team->t.ompt_team_info.parallel_id;
+                tId = this_thr->th.th_current_task->ompt_task_info.task_id;
+            }
+            ompt_callbacks.ompt_callback(ompt_event_wait_barrier_end)(pId, tId);
+        }
+    }
+#endif
+
     KMP_FSYNC_SPIN_ACQUIRED(spin);
 }

Index: runtime/src/z_Linux_asm.s
===================================================================
--- runtime/src/z_Linux_asm.s
+++ runtime/src/z_Linux_asm.s
@@ -598,6 +598,12 @@
 //	temp:	-8(%ebp)
 //
 	pushl	%ebx			// save %ebx to use during this routine
+	//
+#if OMPT_SUPPORT
+	movl	28(%ebp),%ebx		// get exit_frame address
+	movl	%ebp,(%ebx)		// save exit_frame
+#endif
+
 	movl	20(%ebp),%ebx		// Stack alignment - # args
 	addl	$2,%ebx			// #args +2  Always pass at least 2 args (gtid and tid)
 	shll	$2,%ebx			// Number of bytes used on stack: (#args+2)*4
@@ -1221,6 +1227,7 @@
 // 	%edx:	tid
 //	%ecx:	argc
 //	%r8:	p_argv
+//	%r9:	&exit_frame
 //
 // locals:
 //	__gtid:	gtid parm pushed on stack so can pass &gtid to pkfn
 	KMP_CFI_OFFSET rbp,-16
 	movq	%rsp,%rbp		// establish the base pointer for this routine.
 	KMP_CFI_REGISTER rbp
+
+#if OMPT_SUPPORT
+	movq	%rbp, (%r9)		// save exit_frame
+#endif
+
 	pushq	%rbx			// %rbx is callee-saved register
 	pushq	%rsi			// Put gtid on stack so can pass &tgid to pkfn
 	pushq	%rdx			// Put tid on stack so can pass &tid to pkfn

Index: runtime/src/z_Windows_NT-586_asm.asm
===================================================================
--- runtime/src/z_Windows_NT-586_asm.asm
+++ runtime/src/z_Windows_NT-586_asm.asm
@@ -582,6 +582,9 @@
 _tid$ = 16
 _argc$ = 20
 _argv$ = 24
+if OMPT_SUPPORT
+_exit_frame$ = 28
+endif
 _i$ = -8
 _stk_adj$ = -16
 _vptr$ = -12
@@ -595,6 +598,10 @@
 	push	ebx
 	push	esi
 	push	edi
+if OMPT_SUPPORT
+	mov	eax, DWORD PTR _exit_frame$[ebp]
+	mov	DWORD PTR [eax], ebp
+endif
 ; Line 114
 	mov	eax, DWORD PTR _argc$[ebp]
 	mov	DWORD PTR _i$[ebp], eax
@@ -1307,6 +1314,9 @@
 $_tid = 32
 $_argc = 40
 $_p_argv = 48
+if OMPT_SUPPORT
+$_exit_frame = 56
+endif
 PUBLIC	__kmp_invoke_microtask
 _TEXT	SEGMENT
@@ -1322,6 +1332,10 @@
 	lea	rbp, QWORD PTR [rsp]	; establish the base pointer
 	.setframe rbp, 0
 	.ENDPROLOG
+if OMPT_SUPPORT
+	mov	rax, QWORD PTR $_exit_frame[rbp]
+	mov	QWORD PTR [rax], rbp
+endif
 	mov	r10, rcx	; save pkfn pointer for later
 ;; ------------------------------------------------------------
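All four assembly stubs above implement the same contract: before dispatching to pkfn, store the stub's own frame pointer through the extra exit_frame argument so a tool can splice user and runtime stacks at that point. A C-level sketch of the contract (illustrative only; the real code stays in assembly so the published frame layout is exact):

    /* Sketch of what the assembly does around the pkfn dispatch. */
    int invoke_microtask_sketch(microtask_t pkfn, int gtid, int tid,
                                int argc, void *argv[]
    #if OMPT_SUPPORT
                                , void **exit_frame_ptr
    #endif
                                ) {
    #if OMPT_SUPPORT
        *exit_frame_ptr = __builtin_frame_address(0);   /* save exit_frame */
    #endif
        /* forward the first two of argc arguments; the real stubs handle
           the general case with computed stack copies */
        (*pkfn)(&gtid, &tid, argv[0], argv[1]);
        return 1;
    }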