Index: runtime/src/kmp_barrier.cpp
===================================================================
--- runtime/src/kmp_barrier.cpp
+++ runtime/src/kmp_barrier.cpp
@@ -46,7 +46,7 @@
                            void (*reduce)(void *, void *)
                            USE_ITT_BUILD_ARG(void * itt_sync_obj) )
 {
-    KMP_TIME_BLOCK(KMP_linear_gather);
+    KMP_TIME_DEVELOPER_BLOCK(KMP_linear_gather);
     register kmp_team_t *team = this_thr->th.th_team;
     register kmp_bstate_t *thr_bar = & this_thr->th.th_bar[bt].bb;
     register kmp_info_t **other_threads = team->t.t_threads;
@@ -123,7 +123,7 @@
                             int propagate_icvs
                             USE_ITT_BUILD_ARG(void *itt_sync_obj) )
 {
-    KMP_TIME_BLOCK(KMP_linear_release);
+    KMP_TIME_DEVELOPER_BLOCK(KMP_linear_release);
     register kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
     register kmp_team_t *team;
@@ -141,17 +141,18 @@
     if (nproc > 1) {
 #if KMP_BARRIER_ICV_PUSH
-        KMP_START_EXPLICIT_TIMER(USER_icv_copy);
-        if (propagate_icvs) {
-            ngo_load(&team->t.t_implicit_task_taskdata[0].td_icvs);
-            for (i=1; i<nproc; ++i) {
-                __kmp_init_implicit_task(team->t.t_ident, team->t.t_threads[i], team, i, FALSE);
-                ngo_store_icvs(&team->t.t_implicit_task_taskdata[i].td_icvs,
-                               &team->t.t_implicit_task_taskdata[0].td_icvs);
+        {
+            KMP_TIME_DEVELOPER_BLOCK(USER_icv_copy);
+            if (propagate_icvs) {
+                ngo_load(&team->t.t_implicit_task_taskdata[0].td_icvs);
+                for (i=1; i<nproc; ++i) {
+                    __kmp_init_implicit_task(team->t.t_ident, team->t.t_threads[i], team, i, FALSE);
+                    ngo_store_icvs(&team->t.t_implicit_task_taskdata[i].td_icvs,
+                                   &team->t.t_implicit_task_taskdata[0].td_icvs);
+                }
+                ngo_sync();
             }
-            ngo_sync();
         }
-        KMP_STOP_EXPLICIT_TIMER(USER_icv_copy);
 #endif // KMP_BARRIER_ICV_PUSH

         // Now, release all of the worker threads
@@ -217,7 +218,7 @@
                           void (*reduce)(void *, void *)
                           USE_ITT_BUILD_ARG(void *itt_sync_obj) )
 {
-    KMP_TIME_BLOCK(KMP_tree_gather);
+    KMP_TIME_DEVELOPER_BLOCK(KMP_tree_gather);
     register kmp_team_t *team = this_thr->th.th_team;
     register kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
     register kmp_info_t **other_threads = team->t.t_threads;
@@ -312,7 +313,7 @@
                            int propagate_icvs
                            USE_ITT_BUILD_ARG(void *itt_sync_obj) )
 {
-    KMP_TIME_BLOCK(KMP_tree_release);
+    KMP_TIME_DEVELOPER_BLOCK(KMP_tree_release);
     register kmp_team_t *team;
     register kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
     register kmp_uint32 nproc;
@@ -381,14 +382,15 @@
 #endif /* KMP_CACHE_MANAGE */

 #if KMP_BARRIER_ICV_PUSH
-            KMP_START_EXPLICIT_TIMER(USER_icv_copy);
-            if (propagate_icvs) {
-                __kmp_init_implicit_task(team->t.t_ident, team->t.t_threads[child_tid],
-                                         team, child_tid, FALSE);
-                copy_icvs(&team->t.t_implicit_task_taskdata[child_tid].td_icvs,
-                          &team->t.t_implicit_task_taskdata[0].td_icvs);
+            {
+                KMP_TIME_DEVELOPER_BLOCK(USER_icv_copy);
+                if (propagate_icvs) {
+                    __kmp_init_implicit_task(team->t.t_ident, team->t.t_threads[child_tid],
+                                             team, child_tid, FALSE);
+                    copy_icvs(&team->t.t_implicit_task_taskdata[child_tid].td_icvs,
+                              &team->t.t_implicit_task_taskdata[0].td_icvs);
+                }
             }
-            KMP_STOP_EXPLICIT_TIMER(USER_icv_copy);
 #endif // KMP_BARRIER_ICV_PUSH
             KA_TRACE(20, ("__kmp_tree_barrier_release: T#%d(%d:%d) releasing T#%d(%d:%u)"
                           "go(%p): %u => %u\n", gtid, team->t.t_id, tid,
@@ -414,7 +416,7 @@
                            void (*reduce)(void *, void *)
                            USE_ITT_BUILD_ARG(void *itt_sync_obj) )
 {
-    KMP_TIME_BLOCK(KMP_hyper_gather);
+    KMP_TIME_DEVELOPER_BLOCK(KMP_hyper_gather);
     register kmp_team_t *team = this_thr->th.th_team;
     register kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
     register kmp_info_t **other_threads = team->t.t_threads;
@@ -520,7 +522,7 @@
                             int propagate_icvs
                             USE_ITT_BUILD_ARG(void *itt_sync_obj) )
 {
-    KMP_TIME_BLOCK(KMP_hyper_release);
+    KMP_TIME_DEVELOPER_BLOCK(KMP_hyper_release);
     register kmp_team_t *team;
     register kmp_bstate_t *thr_bar = & this_thr -> th.th_bar[ bt ].bb;
     register kmp_info_t **other_threads;
@@ -725,7 +727,7 @@
                          int gtid, int tid, void (*reduce) (void *, void *)
                          USE_ITT_BUILD_ARG(void * itt_sync_obj) )
 {
-    KMP_TIME_BLOCK(KMP_hier_gather);
+    KMP_TIME_DEVELOPER_BLOCK(KMP_hier_gather);
     register kmp_team_t *team = this_thr->th.th_team;
     register kmp_bstate_t *thr_bar = & this_thr->th.th_bar[bt].bb;
     register kmp_uint32 nproc = this_thr->th.th_team_nproc;
@@ -853,7 +855,7 @@
                           int propagate_icvs
                           USE_ITT_BUILD_ARG(void * itt_sync_obj) )
 {
-    KMP_TIME_BLOCK(KMP_hier_release);
+    KMP_TIME_DEVELOPER_BLOCK(KMP_hier_release);
     register kmp_team_t *team;
     register kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
     register kmp_uint32 nproc;
@@ -1035,7 +1037,7 @@
 __kmp_barrier(enum barrier_type bt, int gtid, int is_split, size_t reduce_size,
               void *reduce_data, void (*reduce)(void *, void *))
 {
-    KMP_TIME_BLOCK(KMP_barrier);
+    KMP_TIME_DEVELOPER_BLOCK(KMP_barrier);
     register int tid = __kmp_tid_from_gtid(gtid);
     register kmp_info_t *this_thr = __kmp_threads[gtid];
     register kmp_team_t *team = this_thr->th.th_team;
@@ -1294,7 +1296,7 @@
 void
 __kmp_end_split_barrier(enum barrier_type bt, int gtid)
 {
-    KMP_TIME_BLOCK(KMP_end_split_barrier);
+    KMP_TIME_DEVELOPER_BLOCK(KMP_end_split_barrier);
     int tid = __kmp_tid_from_gtid(gtid);
     kmp_info_t *this_thr = __kmp_threads[gtid];
     kmp_team_t *team = this_thr->th.th_team;
@@ -1335,7 +1337,7 @@
 void
 __kmp_join_barrier(int gtid)
 {
-    KMP_TIME_BLOCK(KMP_join_barrier);
+    KMP_TIME_DEVELOPER_BLOCK(KMP_join_barrier);
     register kmp_info_t *this_thr = __kmp_threads[gtid];
     register kmp_team_t *team;
     register kmp_uint nproc;
@@ -1533,7 +1535,7 @@
 void
 __kmp_fork_barrier(int gtid, int tid)
 {
-    KMP_TIME_BLOCK(KMP_fork_barrier);
+    KMP_TIME_DEVELOPER_BLOCK(KMP_fork_barrier);
     kmp_info_t *this_thr = __kmp_threads[gtid];
     kmp_team_t *team = (tid == 0) ? this_thr->th.th_team : NULL;
 #if USE_ITT_BUILD
@@ -1648,15 +1650,16 @@
        this data before this function is called. We cannot modify __kmp_fork_call() to look at
        the fixed ICVs in the master's thread struct, because it is not always the case that the
        threads arrays have been allocated when __kmp_fork_call() is executed. */
-    KMP_START_EXPLICIT_TIMER(USER_icv_copy);
-    if (!KMP_MASTER_TID(tid)) { // master thread already has ICVs
-        // Copy the initial ICVs from the master's thread struct to the implicit task for this tid.
-        KA_TRACE(10, ("__kmp_fork_barrier: T#%d(%d) is PULLing ICVs\n", gtid, tid));
-        __kmp_init_implicit_task(team->t.t_ident, team->t.t_threads[tid], team, tid, FALSE);
-        copy_icvs(&team->t.t_implicit_task_taskdata[tid].td_icvs,
-                  &team->t.t_threads[0]->th.th_bar[bs_forkjoin_barrier].bb.th_fixed_icvs);
+    {
+        KMP_TIME_DEVELOPER_BLOCK(USER_icv_copy);
+        if (!KMP_MASTER_TID(tid)) { // master thread already has ICVs
+            // Copy the initial ICVs from the master's thread struct to the implicit task for this tid.
+            KA_TRACE(10, ("__kmp_fork_barrier: T#%d(%d) is PULLing ICVs\n", gtid, tid));
+            __kmp_init_implicit_task(team->t.t_ident, team->t.t_threads[tid], team, tid, FALSE);
+            copy_icvs(&team->t.t_implicit_task_taskdata[tid].td_icvs,
+                      &team->t.t_threads[0]->th.th_bar[bs_forkjoin_barrier].bb.th_fixed_icvs);
+        }
     }
-    KMP_STOP_EXPLICIT_TIMER(USER_icv_copy);
 #endif // KMP_BARRIER_ICV_PULL

     if (__kmp_tasking_mode != tskm_immediate_exec) {
@@ -1702,7 +1705,7 @@
 void
 __kmp_setup_icv_copy(kmp_team_t *team, int new_nproc, kmp_internal_control_t *new_icvs,
                      ident_t *loc )
 {
-    KMP_TIME_BLOCK(KMP_setup_icv_copy);
+    KMP_TIME_DEVELOPER_BLOCK(KMP_setup_icv_copy);

     KMP_DEBUG_ASSERT(team && new_nproc && new_icvs);
     KMP_DEBUG_ASSERT((!TCR_4(__kmp_init_parallel)) || new_icvs->nproc);
Index: runtime/src/kmp_cancel.cpp
===================================================================
--- runtime/src/kmp_cancel.cpp
+++ runtime/src/kmp_cancel.cpp
@@ -58,7 +58,7 @@
                 break;
             }
         case cancel_taskgroup:
-            // cancellation requests for parallel and worksharing constructs
+            // cancellation requests for a task group
             // are handled through the taskgroup structure
             {
                 kmp_taskdata_t*  task;
@@ -141,7 +141,7 @@
                 break;
             }
         case cancel_taskgroup:
-            // cancellation requests for parallel and worksharing constructs
+            // cancellation requests for a task group
             // are handled through the taskgroup structure
             {
                 kmp_taskdata_t*  task;
Index: runtime/src/kmp_csupport.c
===================================================================
--- runtime/src/kmp_csupport.c
+++ runtime/src/kmp_csupport.c
@@ -280,9 +280,21 @@
 void
 __kmpc_fork_call(ident_t *loc, kmp_int32 argc, kmpc_micro microtask, ...)
 {
-    KMP_STOP_EXPLICIT_TIMER(OMP_serial);
-    KMP_COUNT_BLOCK(OMP_PARALLEL);
     int gtid = __kmp_entry_gtid();
+
+#if (KMP_STATS_ENABLED)
+    int inParallel = __kmpc_in_parallel(loc);
+    if (inParallel)
+    {
+        KMP_COUNT_BLOCK(OMP_NESTED_PARALLEL);
+    }
+    else
+    {
+        KMP_STOP_EXPLICIT_TIMER(OMP_serial);
+        KMP_COUNT_BLOCK(OMP_PARALLEL);
+    }
+#endif
+
     // maybe to save thr_state is enough here
     {
         va_list ap;
@@ -329,7 +341,10 @@
         }
 #endif
     }
-    KMP_START_EXPLICIT_TIMER(OMP_serial);
+#if (KMP_STATS_ENABLED)
+    if (!inParallel)
+        KMP_START_EXPLICIT_TIMER(OMP_serial);
+#endif
 }

 #if OMP_40_ENABLED
@@ -370,6 +385,8 @@
     va_list ap;
     va_start( ap, microtask );

+    KMP_COUNT_BLOCK(OMP_TEAMS);
+
     // remember teams entry point and nesting level
     this_thr->th.th_teams_microtask = microtask;
     this_thr->th.th_teams_level = this_thr->th.th_team->t.t_level; // AC: can be >0 on host
@@ -2191,7 +2208,6 @@
 __kmpc_test_lock( ident_t *loc, kmp_int32 gtid, void **user_lock )
 {
     KMP_COUNT_BLOCK(OMP_test_lock);
-    KMP_TIME_BLOCK(OMP_test_lock);

 #if KMP_USE_DYNAMIC_LOCK
     int rc;
Index: runtime/src/kmp_dispatch.cpp
===================================================================
--- runtime/src/kmp_dispatch.cpp
+++ runtime/src/kmp_dispatch.cpp
@@ -670,6 +670,7 @@
     } else {
         pr->ordered = FALSE;
     }
+
     if ( schedule == kmp_sch_static ) {
         schedule = __kmp_static;
     } else {
@@ -761,6 +762,19 @@
             tc = 0;            // zero-trip
         }

+        // Any half-decent optimizer will remove this test when the blocks are empty, since the macros expand to nothing
+        // when statistics are disabled.
+        if (schedule == __kmp_static)
+        {
+            KMP_COUNT_BLOCK(OMP_FOR_static);
+            KMP_COUNT_VALUE(FOR_static_iterations, tc);
+        }
+        else
+        {
+            KMP_COUNT_BLOCK(OMP_FOR_dynamic);
+            KMP_COUNT_VALUE(FOR_dynamic_iterations, tc);
+        }
+
     pr->u.p.lb = lb;
     pr->u.p.ub = ub;
     pr->u.p.st = st;
@@ -1384,6 +1398,11 @@
     static const int ___kmp_size_type = sizeof( UT );
 #endif

+    // This is potentially slightly misleading: schedule(runtime) will appear here even if the actual runtime schedule
+    // is static. (Which points out a disadvantage of schedule(runtime): even when static scheduling is used it costs
+    // more than a compile-time choice to use static scheduling would.)
+    KMP_TIME_BLOCK(FOR_dynamic_scheduling);
+
     int status;
     dispatch_private_info_template< T > * pr;
     kmp_info_t * th = __kmp_threads[ gtid ];
@@ -2164,7 +2183,6 @@
                       T *pupper,
                       typename traits_t< T >::signed_t incr
 ) {
-    KMP_COUNT_BLOCK(OMP_DISTR_FOR_dynamic);
     typedef typename traits_t< T >::unsigned_t  UT;
     typedef typename traits_t< T >::signed_t    ST;
     register kmp_uint32  team_id;
@@ -2222,6 +2240,7 @@
     } else {
         trip_count = (ST)(*pupper - *plower) / incr + 1; // cast to signed to cover incr<0 case
     }
+
     if( trip_count <= nteams ) {
         KMP_DEBUG_ASSERT(
             __kmp_static == kmp_sch_static_greedy || \
@@ -2297,7 +2316,6 @@
 __kmpc_dispatch_init_4( ident_t *loc, kmp_int32 gtid, enum sched_type schedule,
                         kmp_int32 lb, kmp_int32 ub, kmp_int32 st, kmp_int32 chunk )
 {
-    KMP_COUNT_BLOCK(OMP_FOR_dynamic);
     KMP_DEBUG_ASSERT( __kmp_init_serial );
     __kmp_dispatch_init< kmp_int32 >( loc, gtid, schedule, lb, ub, st, chunk, true );
 }
@@ -2308,7 +2326,6 @@
 __kmpc_dispatch_init_4u( ident_t *loc, kmp_int32 gtid, enum sched_type schedule,
                          kmp_uint32 lb, kmp_uint32 ub, kmp_int32 st, kmp_int32 chunk )
 {
-    KMP_COUNT_BLOCK(OMP_FOR_dynamic);
     KMP_DEBUG_ASSERT( __kmp_init_serial );
     __kmp_dispatch_init< kmp_uint32 >( loc, gtid, schedule, lb, ub, st, chunk, true );
 }
@@ -2321,7 +2338,6 @@
                         kmp_int64 lb, kmp_int64 ub, kmp_int64 st, kmp_int64 chunk )
 {
-    KMP_COUNT_BLOCK(OMP_FOR_dynamic);
     KMP_DEBUG_ASSERT( __kmp_init_serial );
     __kmp_dispatch_init< kmp_int64 >( loc, gtid, schedule, lb, ub, st, chunk, true );
 }
@@ -2334,7 +2350,6 @@
                          kmp_uint64 lb, kmp_uint64 ub, kmp_int64 st, kmp_int64 chunk )
 {
-    KMP_COUNT_BLOCK(OMP_FOR_dynamic);
     KMP_DEBUG_ASSERT( __kmp_init_serial );
     __kmp_dispatch_init< kmp_uint64 >( loc, gtid, schedule, lb, ub, st, chunk, true );
 }
@@ -2352,7 +2367,6 @@
 __kmpc_dist_dispatch_init_4( ident_t *loc, kmp_int32 gtid, enum sched_type schedule,
                              kmp_int32 *p_last, kmp_int32 lb, kmp_int32 ub, kmp_int32 st, kmp_int32 chunk )
 {
-    KMP_COUNT_BLOCK(OMP_FOR_dynamic);
     KMP_DEBUG_ASSERT( __kmp_init_serial );
     __kmp_dist_get_bounds< kmp_int32 >( loc, gtid, p_last, &lb, &ub, st );
     __kmp_dispatch_init< kmp_int32 >( loc, gtid, schedule, lb, ub, st, chunk, true );
@@ -2362,7 +2376,6 @@
 __kmpc_dist_dispatch_init_4u( ident_t *loc, kmp_int32 gtid, enum sched_type schedule,
                               kmp_int32 *p_last, kmp_uint32 lb, kmp_uint32 ub, kmp_int32 st, kmp_int32 chunk )
 {
-    KMP_COUNT_BLOCK(OMP_FOR_dynamic);
     KMP_DEBUG_ASSERT( __kmp_init_serial );
     __kmp_dist_get_bounds< kmp_uint32 >( loc, gtid, p_last, &lb, &ub, st );
     __kmp_dispatch_init< kmp_uint32 >( loc, gtid, schedule, lb, ub, st, chunk, true );
@@ -2372,7 +2385,6 @@
 __kmpc_dist_dispatch_init_8( ident_t *loc, kmp_int32 gtid, enum sched_type schedule,
                              kmp_int32 *p_last, kmp_int64 lb, kmp_int64 ub, kmp_int64 st, kmp_int64 chunk )
 {
-    KMP_COUNT_BLOCK(OMP_FOR_dynamic);
     KMP_DEBUG_ASSERT( __kmp_init_serial );
     __kmp_dist_get_bounds< kmp_int64 >( loc, gtid, p_last, &lb, &ub, st );
     __kmp_dispatch_init< kmp_int64 >( loc, gtid, schedule, lb, ub, st, chunk, true );
 }
@@ -2382,7 +2394,6 @@
 __kmpc_dist_dispatch_init_8u( ident_t *loc, kmp_int32 gtid, enum sched_type schedule,
                               kmp_int32 *p_last, kmp_uint64 lb, kmp_uint64 ub, kmp_int64 st, kmp_int64 chunk )
 {
-    KMP_COUNT_BLOCK(OMP_FOR_dynamic);
     KMP_DEBUG_ASSERT( __kmp_init_serial );
     __kmp_dist_get_bounds< kmp_uint64 >( loc, gtid, p_last, &lb, &ub, st );
     __kmp_dispatch_init< kmp_uint64 >( loc, gtid, schedule, lb, ub, st, chunk, true );
 }
Index: runtime/src/kmp_runtime.c
===================================================================
--- runtime/src/kmp_runtime.c
+++ runtime/src/kmp_runtime.c
@@ -1495,7 +1495,8 @@
     kmp_hot_team_ptr_t **p_hot_teams;
 #endif
     { // KMP_TIME_BLOCK
-    KMP_TIME_BLOCK(KMP_fork_call);
+    KMP_TIME_DEVELOPER_BLOCK(KMP_fork_call);
+    KMP_COUNT_VALUE(OMP_PARALLEL_args, argc);

     KA_TRACE( 20, ("__kmp_fork_call: enter T#%d\n", gtid ));
     if ( __kmp_stkpadding > 0 && __kmp_root[gtid] != NULL ) {
@@ -1620,12 +1621,14 @@
             }
 #endif

-            KMP_TIME_BLOCK(OMP_work);
-            __kmp_invoke_microtask( microtask, gtid, 0, argc, parent_team->t.t_argv
+            {
+                KMP_TIME_BLOCK(OMP_work);
+                __kmp_invoke_microtask( microtask, gtid, 0, argc, parent_team->t.t_argv
 #if OMPT_SUPPORT
-                , exit_runtime_p
+                    , exit_runtime_p
 #endif
-                );
+                    );
+            }

 #if OMPT_SUPPORT
             if (ompt_status & ompt_status_track) {
@@ -2224,8 +2227,8 @@
     }  // END of timer KMP_fork_call block

     {
-        //KMP_TIME_BLOCK(OMP_work);
-        KMP_TIME_BLOCK(USER_master_invoke);
+        KMP_TIME_BLOCK(OMP_work);
+        // KMP_TIME_DEVELOPER_BLOCK(USER_master_invoke);
         if (! team->t.t_invoke( gtid )) {
             KMP_ASSERT2( 0, "cannot invoke microtask for MASTER thread" );
         }
@@ -2280,7 +2283,7 @@
 #endif /* OMP_40_ENABLED */
 )
 {
-    KMP_TIME_BLOCK(KMP_join_call);
+    KMP_TIME_DEVELOPER_BLOCK(KMP_join_call);
     kmp_team_t *team;
     kmp_team_t *parent_team;
     kmp_info_t *master_th;
@@ -2582,6 +2585,7 @@
     else if (new_nth > __kmp_max_nth)
         new_nth = __kmp_max_nth;

+    KMP_COUNT_VALUE(OMP_set_numthreads, new_nth);
     thread = __kmp_threads[gtid];

     __kmp_save_internal_controls( thread );
@@ -4790,7 +4794,7 @@
                     kmp_internal_control_t *new_icvs,
                     int argc USE_NESTED_HOT_ARG(kmp_info_t *master) )
 {
-    KMP_TIME_BLOCK(KMP_allocate_team);
+    KMP_TIME_DEVELOPER_BLOCK(KMP_allocate_team);
     int f;
     kmp_team_t *team;
     int use_hot_team = ! root->r.r_active;
@@ -5577,12 +5581,12 @@
         }
 #endif

-        KMP_STOP_EXPLICIT_TIMER(USER_launch_thread_loop);
+        KMP_STOP_DEVELOPER_EXPLICIT_TIMER(USER_launch_thread_loop);
         {
-            KMP_TIME_BLOCK(USER_worker_invoke);
+            KMP_TIME_DEVELOPER_BLOCK(USER_worker_invoke);
             rc = (*pteam)->t.t_invoke( gtid );
         }
-        KMP_START_EXPLICIT_TIMER(USER_launch_thread_loop);
+        KMP_START_DEVELOPER_EXPLICIT_TIMER(USER_launch_thread_loop);
         KMP_ASSERT( rc );

 #if OMPT_SUPPORT
@@ -6910,12 +6914,15 @@
 #endif
 #endif

-    rc = __kmp_invoke_microtask( (microtask_t) TCR_SYNC_PTR(team->t.t_pkfn),
-                                 gtid, tid, (int) team->t.t_argc, (void **) team->t.t_argv
+    {
+        KMP_TIME_BLOCK(OMP_work);
+        rc = __kmp_invoke_microtask( (microtask_t) TCR_SYNC_PTR(team->t.t_pkfn),
+                                     gtid, tid, (int) team->t.t_argc, (void **) team->t.t_argv
 #if OMPT_SUPPORT
-                                 , exit_runtime_p
+                                     , exit_runtime_p
 #endif
-                                 );
+                                     );
+    }

 #if OMPT_SUPPORT && OMPT_TRACE
     if (ompt_status & ompt_status_track) {
Index: runtime/src/kmp_sched.cpp
===================================================================
--- runtime/src/kmp_sched.cpp
+++ runtime/src/kmp_sched.cpp
@@ -84,6 +84,8 @@
     typename traits_t< T >::signed_t chunk
 ) {
     KMP_COUNT_BLOCK(OMP_FOR_static);
+    KMP_TIME_BLOCK (FOR_static_scheduling);
+
     typedef typename traits_t< T >::unsigned_t  UT;
     typedef typename traits_t< T >::signed_t    ST;
     /*  this all has to be changed back to TID and such.. */
@@ -151,6 +153,7 @@
                               team_info->microtask);
         }
 #endif
+        KMP_COUNT_VALUE (FOR_static_iterations, 0);
         return;
     }

@@ -246,6 +249,7 @@
             __kmp_error_construct( kmp_i18n_msg_CnsIterationRangeTooLarge, ct_pdo, loc );
         }
     }
+    KMP_COUNT_VALUE (FOR_static_iterations, trip_count);

     /* compute remaining parameters */
     switch ( schedtype ) {
@@ -372,7 +376,7 @@
     typename traits_t< T >::signed_t incr,
     typename traits_t< T >::signed_t chunk
 ) {
-    KMP_COUNT_BLOCK(OMP_DISTR_FOR_static);
+    KMP_COUNT_BLOCK(OMP_DISTRIBUTE);
     typedef typename traits_t< T >::unsigned_t  UT;
     typedef typename traits_t< T >::signed_t    ST;
     register kmp_uint32  tid;
@@ -437,6 +441,7 @@
     } else {
         trip_count = (ST)(*pupper - *plower) / incr + 1; // cast to signed to cover incr<0 case
     }
+
     *pstride = *pupper - *plower;  // just in case (can be unused)
     if( trip_count <= nteams ) {
         KMP_DEBUG_ASSERT(
Index: runtime/src/kmp_stats.h
===================================================================
--- runtime/src/kmp_stats.h
+++ runtime/src/kmp_stats.h
@@ -31,6 +31,11 @@
 #include <new>   // placement new
 #include "kmp_stats_timing.h"

+/*
+ * Enable developer statistics here if you want them. They are more detailed than is useful for application characterisation and
+ * are intended for the runtime library developer.
+ */
+// #define KMP_DEVELOPER_STATS 1

 /*!
  * @ingroup STATS_GATHERING
@@ -56,7 +61,7 @@
  * Each thread accumulates its own count, at the end of execution the counts are aggregated treating each thread
  * as a separate measurement. (Unless onlyInMaster is set, in which case there's only a single measurement).
  * The min,mean,max are therefore the values for the threads.
- * Adding the counter here and then putting in a KMP_BLOCK_COUNTER(name) is all you need to do.
+ * Adding the counter here and then putting a KMP_BLOCK_COUNTER(name) at the point you want to count is all you need to do.
 * All of the tables and printing is generated from this macro.
 * Format is "macro(name, flags, arg)"
 *
 * @ingroup STATS_GATHERING
 */
@@ -64,21 +69,30 @@
 #define KMP_FOREACH_COUNTER(macro, arg)                     \
     macro (OMP_PARALLEL, stats_flags_e::onlyInMaster, arg)  \
+    macro (OMP_NESTED_PARALLEL, 0, arg)                     \
     macro (OMP_FOR_static, 0, arg)                          \
     macro (OMP_FOR_dynamic, 0, arg)                         \
-    macro (OMP_DISTR_FOR_static, 0, arg)                    \
-    macro (OMP_DISTR_FOR_dynamic, 0, arg)                   \
+    macro (OMP_DISTRIBUTE, 0, arg)                          \
     macro (OMP_BARRIER, 0, arg)                             \
     macro (OMP_CRITICAL,0, arg)                             \
     macro (OMP_SINGLE, 0, arg)                              \
     macro (OMP_MASTER, 0, arg)                              \
+    macro (OMP_TEAMS, 0, arg)                               \
     macro (OMP_set_lock, 0, arg)                            \
     macro (OMP_test_lock, 0, arg)                           \
-    macro (OMP_test_lock_failure, 0, arg)                   \
     macro (REDUCE_wait, 0, arg)                             \
     macro (REDUCE_nowait, 0, arg)                           \
+    macro (OMP_TASKYIELD, 0, arg)                           \
+    macro (TASK_executed, 0, arg)                           \
+    macro (TASK_cancelled, 0, arg)                          \
+    macro (TASK_stolen, 0, arg)                             \
     macro (LAST,0,arg)

+// OMP_PARALLEL_args      -- the number of arguments passed to a fork
+// FOR_static_iterations  -- Number of available parallel chunks of work in a static for
+// FOR_dynamic_iterations -- Number of available parallel chunks of work in a dynamic for
+//                           Both adjust for any chunking, so if there were an iteration count of 20 but a chunk size of 10, we'd record 2.
+
 /*!
  * \brief Add new timers under KMP_FOREACH_TIMER() macro in kmp_stats.h
 *
 * \details A timer collects multiple samples of some count in each thread and then finally aggregates over all the threads.
 * The count is normally a time (in ticks), hence the name "timer". (But can be any value, so we use this for "number of arguments passed to fork"
- * as well, or we could collect "loop iteration count" if we wanted to).
+ * as well).
 * For timers the threads are not significant, it's the individual observations that count, so the statistics are at that level.
 * Format is "macro(name, flags, arg)"
 *
- * @ingroup STATS_GATHERING
+ * @ingroup STATS_GATHERING2
 */
-#define KMP_FOREACH_TIMER(macro, arg)                                                        \
-    macro (OMP_PARALLEL_args, stats_flags_e::onlyInMaster | stats_flags_e::noUnits, arg)     \
-    macro (FOR_static_iterations, stats_flags_e::onlyInMaster | stats_flags_e::noUnits, arg) \
-    macro (FOR_dynamic_iterations, stats_flags_e::noUnits, arg)                              \
+#define KMP_FOREACH_TIMER(macro, arg)                                                        \
     macro (OMP_start_end, stats_flags_e::onlyInMaster, arg)                                  \
     macro (OMP_serial, stats_flags_e::onlyInMaster, arg)                                     \
     macro (OMP_work, 0, arg)                                                                 \
     macro (Total_work, stats_flags_e::synthesized, arg)                                      \
-    macro (OMP_await_work, stats_flags_e::notInMaster, arg)                                  \
-    macro (Total_await_work, stats_flags_e::synthesized, arg)                                \
     macro (OMP_barrier, 0, arg)                                                              \
     macro (Total_barrier, stats_flags_e::synthesized, arg)                                   \
-    macro (OMP_test_lock, 0, arg)                                                            \
+    macro (FOR_static_iterations, stats_flags_e::noUnits, arg)                               \
     macro (FOR_static_scheduling, 0, arg)                                                    \
+    macro (FOR_dynamic_iterations, stats_flags_e::noUnits, arg)                              \
     macro (FOR_dynamic_scheduling, 0, arg)                                                   \
-    macro (KMP_fork_call, 0, arg)                                                            \
-    macro (KMP_join_call, 0, arg)                                                            \
-    macro (KMP_fork_barrier, stats_flags_e::logEvent, arg)                                   \
-    macro (KMP_join_barrier, stats_flags_e::logEvent, arg)                                   \
-    macro (KMP_barrier, 0, arg)                                                              \
-    macro (KMP_end_split_barrier, 0, arg)                                                    \
-    macro (KMP_wait_sleep, 0, arg)                                                           \
-    macro (KMP_release, 0, arg)                                                              \
-    macro (KMP_hier_gather, 0, arg)                                                          \
-    macro (KMP_hier_release, 0, arg)                                                         \
-    macro (KMP_hyper_gather, stats_flags_e::logEvent, arg)                                   \
-    macro (KMP_hyper_release, stats_flags_e::logEvent, arg)                                  \
-    macro (KMP_linear_gather, 0, arg)                                                        \
-    macro (KMP_linear_release, 0, arg)                                                       \
-    macro (KMP_tree_gather, 0, arg)                                                          \
-    macro (KMP_tree_release, 0, arg)                                                         \
-    macro (USER_master_invoke, stats_flags_e::logEvent, arg)                                 \
-    macro (USER_worker_invoke, stats_flags_e::logEvent, arg)                                 \
-    macro (USER_resume, stats_flags_e::logEvent, arg)                                        \
-    macro (USER_suspend, stats_flags_e::logEvent, arg)                                       \
-    macro (USER_launch_thread_loop, stats_flags_e::logEvent, arg)                            \
-    macro (KMP_allocate_team, 0, arg)                                                        \
-    macro (KMP_setup_icv_copy, 0, arg)                                                       \
-    macro (USER_icv_copy, 0, arg)                                                            \
+    macro (TASK_execution, 0, arg)                                                           \
+    macro (OMP_set_numthreads, stats_flags_e::noUnits, arg)                                  \
+    macro (OMP_PARALLEL_args, stats_flags_e::noUnits, arg)                                   \
+    KMP_FOREACH_DEVELOPER_TIMER(macro, arg)                                                  \
     macro (LAST,0, arg)

-
-// OMP_PARALLEL_args      -- the number of arguments passed to a fork
-// FOR_static_iterations  -- Number of available parallel chunks of work in a static for
-// FOR_dynamic_iterations -- Number of available parallel chunks of work in a dynamic for
-//                           Both adjust for any chunking, so if there were an iteration count of 20 but a chunk size of 10, we'd record 2.
-// OMP_serial             -- thread zero time executing serial code
 // OMP_start_end          -- time from when OpenMP is initialized until the stats are printed at exit
+// OMP_serial             -- thread zero time executing serial code
 // OMP_work               -- elapsed time in code dispatched by a fork (measured in the thread)
 // Total_work             -- a synthesized statistic summarizing how much parallel work each thread executed.
 // OMP_barrier            -- time at "real" barriers
 // Total_barrier          -- a synthesized statistic summarizing how much time at real barriers in each thread
-// OMP_set_lock           -- time in lock setting
-// OMP_test_lock          -- time in testing a lock
-// LOCK_WAIT              -- time waiting for a lock
 // FOR_static_scheduling  -- time spent doing scheduling for a static "for"
 // FOR_dynamic_scheduling -- time spent doing scheduling for a dynamic "for"
-// KMP_wait_sleep         -- time in __kmp_wait_sleep
-// KMP_release            -- time in __kmp_release
+
+#if (KMP_DEVELOPER_STATS)
+// Timers which are of interest to runtime library developers, not end users.
+// These have to be explicitly enabled in addition to the other stats.
+
 // KMP_fork_barrier       -- time in __kmp_fork_barrier
 // KMP_join_barrier       -- time in __kmp_join_barrier
 // KMP_barrier            -- time in __kmp_barrier
@@ -165,6 +150,32 @@
 // KMP_tree_release       -- time in __kmp_tree_barrier_release
 // KMP_hyper_gather       -- time in __kmp_hyper_barrier_gather
 // KMP_hyper_release      -- time in __kmp_hyper_barrier_release
+# define KMP_FOREACH_DEVELOPER_TIMER(macro, arg)                         \
+    macro (KMP_fork_call, 0, arg)                                        \
+    macro (KMP_join_call, 0, arg)                                        \
+    macro (KMP_fork_barrier, stats_flags_e::logEvent, arg)               \
+    macro (KMP_join_barrier, stats_flags_e::logEvent, arg)               \
+    macro (KMP_barrier, 0, arg)                                          \
+    macro (KMP_end_split_barrier, 0, arg)                                \
+    macro (KMP_hier_gather, 0, arg)                                      \
+    macro (KMP_hier_release, 0, arg)                                     \
+    macro (KMP_hyper_gather, stats_flags_e::logEvent, arg)               \
+    macro (KMP_hyper_release, stats_flags_e::logEvent, arg)              \
+    macro (KMP_linear_gather, 0, arg)                                    \
+    macro (KMP_linear_release, 0, arg)                                   \
+    macro (KMP_tree_gather, 0, arg)                                      \
+    macro (KMP_tree_release, 0, arg)                                     \
+    macro (USER_master_invoke, stats_flags_e::logEvent, arg)             \
+    macro (USER_worker_invoke, stats_flags_e::logEvent, arg)             \
+    macro (USER_resume, stats_flags_e::logEvent, arg)                    \
+    macro (USER_suspend, stats_flags_e::logEvent, arg)                   \
+    macro (USER_launch_thread_loop, stats_flags_e::logEvent, arg)        \
+    macro (KMP_allocate_team, 0, arg)                                    \
+    macro (KMP_setup_icv_copy, 0, arg)                                   \
+    macro (USER_icv_copy, 0, arg)
+#else
+# define KMP_FOREACH_DEVELOPER_TIMER(macro, arg)
+#endif

 /*!
  * \brief Add new explicit timers under KMP_FOREACH_EXPLICIT_TIMER() macro.
@@ -182,13 +193,19 @@
 *
 * @ingroup STATS_GATHERING
 */
-#define KMP_FOREACH_EXPLICIT_TIMER(macro, arg)                   \
-    macro(OMP_serial, 0, arg)                                    \
-    macro(OMP_start_end, 0, arg)                                 \
-    macro(USER_icv_copy, 0, arg)                                 \
-    macro(USER_launch_thread_loop, stats_flags_e::logEvent, arg) \
+#define KMP_FOREACH_EXPLICIT_TIMER(macro, arg)                   \
+    macro(OMP_serial, 0, arg)                                    \
+    macro(OMP_start_end, 0, arg)                                 \
+    KMP_FOREACH_EXPLICIT_DEVELOPER_TIMER(macro,arg)              \
     macro(LAST, 0, arg)

+#if (KMP_DEVELOPER_STATS)
+# define KMP_FOREACH_EXPLICIT_DEVELOPER_TIMER(macro, arg)        \
+    macro(USER_launch_thread_loop, stats_flags_e::logEvent, arg)
+#else
+# define KMP_FOREACH_EXPLICIT_DEVELOPER_TIMER(macro, arg)
+#endif
+
 #define ENUMERATE(name,ignore,prefix) prefix##name,
 enum timer_e {
     KMP_FOREACH_TIMER(ENUMERATE, TIMER_)
@@ -689,6 +706,21 @@
  */
 #define KMP_RESET_STATS()  __kmp_reset_stats()

+#if (KMP_DEVELOPER_STATS)
+# define KMP_TIME_DEVELOPER_BLOCK(n)             KMP_TIME_BLOCK(n)
+# define KMP_COUNT_DEVELOPER_VALUE(n,v)          KMP_COUNT_VALUE(n,v)
+# define KMP_COUNT_DEVELOPER_BLOCK(n)            KMP_COUNT_BLOCK(n)
+# define KMP_START_DEVELOPER_EXPLICIT_TIMER(n)   KMP_START_EXPLICIT_TIMER(n)
+# define KMP_STOP_DEVELOPER_EXPLICIT_TIMER(n)    KMP_STOP_EXPLICIT_TIMER(n)
+#else
+// Null definitions
+# define KMP_TIME_DEVELOPER_BLOCK(n)             ((void)0)
+# define KMP_COUNT_DEVELOPER_VALUE(n,v)          ((void)0)
+# define KMP_COUNT_DEVELOPER_BLOCK(n)            ((void)0)
+# define KMP_START_DEVELOPER_EXPLICIT_TIMER(n)   ((void)0)
+# define KMP_STOP_DEVELOPER_EXPLICIT_TIMER(n)    ((void)0)
+#endif
+
 #else // KMP_STATS_ENABLED

 // Null definitions
@@ -701,6 +733,11 @@
 #define KMP_OUTPUT_STATS(heading_string)         ((void)0)
 #define KMP_RESET_STATS()                        ((void)0)

+#define KMP_TIME_DEVELOPER_BLOCK(n)              ((void)0)
+#define KMP_COUNT_DEVELOPER_VALUE(n,v)           ((void)0)
+#define KMP_COUNT_DEVELOPER_BLOCK(n)             ((void)0)
+#define KMP_START_DEVELOPER_EXPLICIT_TIMER(n)    ((void)0)
+#define KMP_STOP_DEVELOPER_EXPLICIT_TIMER(n)     ((void)0)
 #endif  // KMP_STATS_ENABLED

 #endif // KMP_STATS_H
Index: runtime/src/kmp_stats.cpp
===================================================================
--- runtime/src/kmp_stats.cpp
+++ runtime/src/kmp_stats.cpp
@@ -521,16 +521,14 @@

         // Special handling for synthesized statistics.
         // These just have to be coded specially here for now.
-        // At present we only have one: the total parallel work done in each thread.
+        // At present we only have a few:
+        // The total parallel work done in each thread.
         // The variance here makes it easy to see load imbalance over the whole program (though, of course,
         // it's possible to have a code with awful load balance in every parallel region but perfect load
         // balance over the whole program.)
+        // The time spent in barriers in each thread.
         allStats[TIMER_Total_work].addSample ((*it)->getTimer(TIMER_OMP_work)->getTotal());
-
-        // Time waiting for work (synthesized)
-        if ((t != 0) || !timeStat::workerOnly(timer_e(TIMER_OMP_await_work)))
-            allStats[TIMER_Total_await_work].addSample ((*it)->getTimer(TIMER_OMP_await_work)->getTotal());
-
         // Time in explicit barriers.
         allStats[TIMER_Total_barrier].addSample ((*it)->getTimer(TIMER_OMP_barrier)->getTotal());
Index: runtime/src/kmp_tasking.c
===================================================================
--- runtime/src/kmp_tasking.c
+++ runtime/src/kmp_tasking.c
@@ -17,6 +17,7 @@
 #include "kmp_i18n.h"
 #include "kmp_itt.h"
 #include "kmp_wait_release.h"
+#include "kmp_stats.h"

 #if OMPT_SUPPORT
 #include "ompt-specific.h"
@@ -1136,6 +1137,7 @@
         kmp_team_t * this_team = this_thr->th.th_team;
         kmp_taskgroup_t * taskgroup = taskdata->td_taskgroup;
         if ((taskgroup && taskgroup->cancel_request) || (this_team->t.t_cancel_request == cancel_parallel)) {
+            KMP_COUNT_BLOCK(TASK_cancelled);
             // this task belongs to a task group and we need to cancel it
             discard = 1 /* true */;
         }
@@ -1146,6 +1148,8 @@
     // Thunks generated by gcc take a different argument list.
     //
     if (!discard) {
+        KMP_COUNT_BLOCK(TASK_executed);
+        KMP_TIME_BLOCK (TASK_execution);
 #endif // OMP_40_ENABLED
 #ifdef KMP_GOMP_COMPAT
         if (taskdata->td_flags.native) {
@@ -1356,6 +1360,8 @@
     kmp_info_t * thread;
     int thread_finished = FALSE;

+    KMP_COUNT_BLOCK(OMP_TASKYIELD);
+
     KA_TRACE(10, ("__kmpc_omp_taskyield(enter): T#%d loc=%p end_part = %d\n",
                   gtid, loc_ref, end_part) );

@@ -1648,6 +1654,7 @@

     __kmp_release_bootstrap_lock( & victim_td -> td.td_deque_lock );

+    KMP_COUNT_BLOCK(TASK_stolen);
     KA_TRACE(10, ("__kmp_steal_task(exit #3): T#%d stole task %p from T#%d: task_team=%p "
                   "ntasks=%d head=%u tail=%u\n",
                   gtid, taskdata, __kmp_gtid_from_thread( victim ), task_team,
Index: runtime/src/z_Linux_util.c
===================================================================
--- runtime/src/z_Linux_util.c
+++ runtime/src/z_Linux_util.c
@@ -1689,7 +1689,7 @@
 template <class C>
 static inline void __kmp_suspend_template( int th_gtid, C *flag )
 {
-    KMP_TIME_BLOCK(USER_suspend);
+    KMP_TIME_DEVELOPER_BLOCK(USER_suspend);
     kmp_info_t *th = __kmp_threads[th_gtid];
     int status;
     typename C::flag_t old_spin;
@@ -1901,7 +1901,7 @@
 void
 __kmp_resume_monitor()
 {
-    KMP_TIME_BLOCK(USER_resume);
+    KMP_TIME_DEVELOPER_BLOCK(USER_resume);
     int status;
 #ifdef KMP_DEBUG
     int gtid = TCR_4(__kmp_init_gtid) ? __kmp_get_gtid() : -1;
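
Usage note (illustrative, not part of the patch): KMP_TIME_BLOCK and KMP_TIME_DEVELOPER_BLOCK
declare a scoped object, so they time everything from the macro to the end of the enclosing
block; that is why the hunks above wrap __kmp_invoke_microtask() and the ICV copies in bare
{ ... } blocks instead of pairing explicit start/stop calls. A minimal sketch of the intended
call-site idiom follows; the function __kmp_do_work() is hypothetical, while the macros and
statistic names are the ones defined in kmp_stats.h by this patch:

    #include "kmp_stats.h"

    /* Hypothetical runtime entry point, for illustration only. */
    void __kmp_do_work(int gtid)
    {
        KMP_COUNT_BLOCK(OMP_PARALLEL);           // counter: bump once per entry
        {
            // Developer-only scoped timer: samples elapsed time for this block.
            // Compiles to ((void)0) unless both KMP_STATS_ENABLED and
            // KMP_DEVELOPER_STATS are defined.
            KMP_TIME_DEVELOPER_BLOCK(KMP_barrier);
            /* ... code to be timed ... */
        }
        KMP_COUNT_VALUE(OMP_PARALLEL_args, 4);   // record one sample of a value
    }

Because the developer variants expand to ((void)0) by default, call sites like this stay
unconditional: no #if guards are needed around them, and an end-user build pays nothing
for the extra instrumentation points.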