Index: openmp/trunk/runtime/src/kmp_barrier.cpp
===================================================================
--- openmp/trunk/runtime/src/kmp_barrier.cpp
+++ openmp/trunk/runtime/src/kmp_barrier.cpp
@@ -1048,6 +1048,8 @@
               void *reduce_data, void (*reduce)(void *, void *))
 {
     KMP_TIME_DEVELOPER_BLOCK(KMP_barrier);
+    KMP_SET_THREAD_STATE_BLOCK(PLAIN_BARRIER);
+    KMP_TIME_PARTITIONED_BLOCK(OMP_plain_barrier);
     register int tid = __kmp_tid_from_gtid(gtid);
     register kmp_info_t *this_thr = __kmp_threads[gtid];
     register kmp_team_t *team = this_thr->th.th_team;
@@ -1348,6 +1350,8 @@
 void
 __kmp_join_barrier(int gtid)
 {
+    KMP_TIME_PARTITIONED_BLOCK(OMP_fork_join_barrier);
+    KMP_SET_THREAD_STATE_BLOCK(FORK_JOIN_BARRIER);
     KMP_TIME_DEVELOPER_BLOCK(KMP_join_barrier);
     register kmp_info_t *this_thr = __kmp_threads[gtid];
     register kmp_team_t *team;
@@ -1463,6 +1467,18 @@
         __kmp_task_team_wait(this_thr, team USE_ITT_BUILD_ARG(itt_sync_obj) );
     }
+#if KMP_STATS_ENABLED
+    // Have the master thread flag the workers to indicate they are now waiting for
+    // the next parallel region. Also wake them up so they switch their timers to idle.
+    for (int i = 0; i < team->t.t_nproc; ++i) {
+        kmp_info_t* team_thread = team->t.t_threads[i];
+        if (team_thread == this_thr)
+            continue;
+        team_thread->th.th_stats->setIdleFlag();
+        if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME && team_thread->th.th_sleep_loc != NULL)
+            __kmp_null_resume_wrapper(__kmp_gtid_from_thread(team_thread), team_thread->th.th_sleep_loc);
+    }
+#endif
 #if USE_ITT_BUILD
     if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
         __kmp_itt_barrier_middle(gtid, itt_sync_obj);
@@ -1546,6 +1562,8 @@
 void
 __kmp_fork_barrier(int gtid, int tid)
 {
+    KMP_TIME_PARTITIONED_BLOCK(OMP_fork_join_barrier);
+    KMP_SET_THREAD_STATE_BLOCK(FORK_JOIN_BARRIER);
     KMP_TIME_DEVELOPER_BLOCK(KMP_fork_barrier);
     kmp_info_t *this_thr = __kmp_threads[gtid];
     kmp_team_t *team = (tid == 0) ? this_thr->th.th_team : NULL;
Index: openmp/trunk/runtime/src/kmp_csupport.c
===================================================================
--- openmp/trunk/runtime/src/kmp_csupport.c
+++ openmp/trunk/runtime/src/kmp_csupport.c
@@ -290,7 +290,6 @@
     }
     else
     {
-        KMP_STOP_EXPLICIT_TIMER(OMP_serial);
         KMP_COUNT_BLOCK(OMP_PARALLEL);
     }
 #endif
@@ -345,10 +344,6 @@
     }
 #endif
     }
-#if (KMP_STATS_ENABLED)
-    if (!inParallel)
-        KMP_START_EXPLICIT_TIMER(OMP_serial);
-#endif
 }
 #if OMP_40_ENABLED
@@ -669,7 +664,6 @@
 __kmpc_barrier(ident_t *loc, kmp_int32 global_tid)
 {
     KMP_COUNT_BLOCK(OMP_BARRIER);
-    KMP_TIME_BLOCK(OMP_barrier);
     KC_TRACE( 10, ("__kmpc_barrier: called T#%d\n", global_tid ) );
     if (! TCR_4(__kmp_init_parallel))
@@ -713,7 +707,7 @@
     if( KMP_MASTER_GTID( global_tid )) {
         KMP_COUNT_BLOCK(OMP_MASTER);
-        KMP_START_EXPLICIT_TIMER(OMP_master);
+        KMP_PUSH_PARTITIONED_TIMER(OMP_master);
         status = 1;
     }
@@ -763,7 +757,7 @@
     KC_TRACE( 10, ("__kmpc_end_master: called T#%d\n", global_tid ) );
     KMP_DEBUG_ASSERT( KMP_MASTER_GTID( global_tid ));
-    KMP_STOP_EXPLICIT_TIMER(OMP_master);
+    KMP_POP_PARTITIONED_TIMER();
 #if OMPT_SUPPORT && OMPT_TRACE
     kmp_info_t *this_thr = __kmp_threads[ global_tid ];
@@ -1088,7 +1082,7 @@
     __kmpc_critical_with_hint(loc, global_tid, crit, omp_lock_hint_none);
 #else
     KMP_COUNT_BLOCK(OMP_CRITICAL);
-    KMP_TIME_BLOCK(OMP_critical_wait);             /* Time spent waiting to enter the critical section */
+    KMP_TIME_PARTITIONED_BLOCK(OMP_critical_wait); /* Time spent waiting to enter the critical section */
     kmp_user_lock_p lck;
     KC_TRACE( 10, ("__kmpc_critical: called T#%d\n", global_tid ) );
@@ -1250,6 +1244,7 @@
     __kmp_itt_critical_acquired( lck );
 #endif /* USE_ITT_BUILD */
+    KMP_PUSH_PARTITIONED_TIMER(OMP_critical);
     KA_TRACE( 15, ("__kmpc_critical: done T#%d\n", global_tid ));
 } // __kmpc_critical_with_hint
@@ -1342,7 +1337,7 @@
 #endif
 #endif // KMP_USE_DYNAMIC_LOCK
-    KMP_STOP_EXPLICIT_TIMER(OMP_critical);
+    KMP_POP_PARTITIONED_TIMER();
     KA_TRACE( 15, ("__kmpc_end_critical: done T#%d\n", global_tid ));
 }
@@ -1464,7 +1459,7 @@
     if (rc) {
         // We are going to execute the single statement, so we should count it.
         KMP_COUNT_BLOCK(OMP_SINGLE);
-        KMP_START_EXPLICIT_TIMER(OMP_single);
+        KMP_PUSH_PARTITIONED_TIMER(OMP_single);
     }
 #if OMPT_SUPPORT && OMPT_TRACE
@@ -1507,7 +1502,7 @@
 __kmpc_end_single(ident_t *loc, kmp_int32 global_tid)
 {
     __kmp_exit_single( global_tid );
-    KMP_STOP_EXPLICIT_TIMER(OMP_single);
+    KMP_POP_PARTITIONED_TIMER();
 #if OMPT_SUPPORT && OMPT_TRACE
     kmp_info_t *this_thr = __kmp_threads[ global_tid ];
Index: openmp/trunk/runtime/src/kmp_dispatch.cpp
===================================================================
--- openmp/trunk/runtime/src/kmp_dispatch.cpp
+++ openmp/trunk/runtime/src/kmp_dispatch.cpp
@@ -1424,7 +1424,7 @@
     // This is potentially slightly misleading, schedule(runtime) will appear here even if the actual runtime schedule
    // is static. (Which points out a disadvantage of schedule(runtime): even when static scheduling is used it costs
    // more than a compile time choice to use static scheduling would.)
-    KMP_TIME_BLOCK(FOR_dynamic_scheduling);
+    KMP_TIME_PARTITIONED_BLOCK(FOR_dynamic_scheduling);
     int status;
     dispatch_private_info_template< T > * pr;
Index: openmp/trunk/runtime/src/kmp_runtime.c
===================================================================
--- openmp/trunk/runtime/src/kmp_runtime.c
+++ openmp/trunk/runtime/src/kmp_runtime.c
@@ -1543,7 +1543,8 @@
 #endif
         {
-            KMP_TIME_BLOCK(OMP_work);
+            KMP_TIME_PARTITIONED_BLOCK(OMP_parallel);
+            KMP_SET_THREAD_STATE_BLOCK(IMPLICIT_TASK);
             __kmp_invoke_microtask( microtask, gtid, 0, argc, parent_team->t.t_argv
 #if OMPT_SUPPORT
                                     , exit_runtime_p
@@ -1618,7 +1619,8 @@
                   gtid, parent_team->t.t_id, parent_team->t.t_pkfn ) );
         {
-            KMP_TIME_BLOCK(OMP_work);
+            KMP_TIME_PARTITIONED_BLOCK(OMP_parallel);
+            KMP_SET_THREAD_STATE_BLOCK(IMPLICIT_TASK);
             if (! parent_team->t.t_invoke( gtid )) {
                 KMP_ASSERT2( 0, "cannot invoke microtask for MASTER thread" );
             }
@@ -1738,7 +1740,8 @@
 #endif
         {
-            KMP_TIME_BLOCK(OMP_work);
+            KMP_TIME_PARTITIONED_BLOCK(OMP_parallel);
+            KMP_SET_THREAD_STATE_BLOCK(IMPLICIT_TASK);
             __kmp_invoke_microtask( microtask, gtid, 0, argc, parent_team->t.t_argv
 #if OMPT_SUPPORT
                                     , exit_runtime_p
@@ -1795,7 +1798,8 @@
             team->t.t_level--;
             // AC: call special invoker for outer "parallel" of the teams construct
             {
-                KMP_TIME_BLOCK(OMP_work);
+                KMP_TIME_PARTITIONED_BLOCK(OMP_parallel);
+                KMP_SET_THREAD_STATE_BLOCK(IMPLICIT_TASK);
                 invoker(gtid);
             }
         } else {
@@ -1842,7 +1846,8 @@
 #endif
         {
-            KMP_TIME_BLOCK(OMP_work);
+            KMP_TIME_PARTITIONED_BLOCK(OMP_parallel);
+            KMP_SET_THREAD_STATE_BLOCK(IMPLICIT_TASK);
             __kmp_invoke_microtask( microtask, gtid, 0, argc, args
 #if OMPT_SUPPORT
                                     , exit_runtime_p
@@ -2178,7 +2183,8 @@
     } // END of timer KMP_fork_call block
     {
-        KMP_TIME_BLOCK(OMP_work);
+        KMP_TIME_PARTITIONED_BLOCK(OMP_parallel);
+        KMP_SET_THREAD_STATE_BLOCK(IMPLICIT_TASK);
         // KMP_TIME_DEVELOPER_BLOCK(USER_master_invoke);
         if (! team->t.t_invoke( gtid )) {
             KMP_ASSERT2( 0, "cannot invoke microtask for MASTER thread" );
@@ -5448,6 +5454,8 @@
             KMP_STOP_DEVELOPER_EXPLICIT_TIMER(USER_launch_thread_loop);
             {
                 KMP_TIME_DEVELOPER_BLOCK(USER_worker_invoke);
+                KMP_TIME_PARTITIONED_BLOCK(OMP_parallel);
+                KMP_SET_THREAD_STATE_BLOCK(IMPLICIT_TASK);
                 rc = (*pteam)->t.t_invoke( gtid );
             }
             KMP_START_DEVELOPER_EXPLICIT_TIMER(USER_launch_thread_loop);
@@ -6783,7 +6791,8 @@
 #endif
     {
-        KMP_TIME_BLOCK(OMP_work);
+        KMP_TIME_PARTITIONED_BLOCK(OMP_parallel);
+        KMP_SET_THREAD_STATE_BLOCK(IMPLICIT_TASK);
         rc = __kmp_invoke_microtask( (microtask_t) TCR_SYNC_PTR(team->t.t_pkfn),
                                      gtid, tid, (int) team->t.t_argc, (void **) team->t.t_argv
 #if OMPT_SUPPORT
Index: openmp/trunk/runtime/src/kmp_sched.cpp
===================================================================
--- openmp/trunk/runtime/src/kmp_sched.cpp
+++ openmp/trunk/runtime/src/kmp_sched.cpp
@@ -84,7 +84,7 @@
     typename traits_t< T >::signed_t chunk )
 {
     KMP_COUNT_BLOCK(OMP_FOR_static);
-    KMP_TIME_BLOCK (FOR_static_scheduling);
+    KMP_TIME_PARTITIONED_BLOCK(FOR_static_scheduling);
     typedef typename traits_t< T >::unsigned_t UT;
     typedef typename traits_t< T >::signed_t   ST;
Index: openmp/trunk/runtime/src/kmp_stats.h
===================================================================
--- openmp/trunk/runtime/src/kmp_stats.h
+++ openmp/trunk/runtime/src/kmp_stats.h
@@ -27,6 +27,7 @@
 #include <limits>
 #include <math.h>
+#include <vector>
 #include <string>
 #include <stdint.h>
 #include <new> // placement new
@@ -52,6 +53,23 @@
 };
 /*!
+ * @ingroup STATS_GATHERING
+ * \brief the states which a thread can be in
+ *
+ */
+enum stats_state_e {
+    IDLE,
+    SERIAL_REGION,
+    FORK_JOIN_BARRIER,
+    PLAIN_BARRIER,
+    TASKWAIT,
+    TASKYIELD,
+    TASKGROUP,
+    IMPLICIT_TASK,
+    EXPLICIT_TASK
+};
+
+/*!
  * \brief Add new counters under KMP_FOREACH_COUNTER() macro in kmp_stats.h
  *
  * @param macro a user defined macro that takes three arguments - macro(COUNTER_NAME, flags, arg)
@@ -103,18 +121,25 @@
  *
  * @ingroup STATS_GATHERING2
  */
-#define KMP_FOREACH_TIMER(macro, arg) \
-    macro (OMP_start_end, stats_flags_e::onlyInMaster | stats_flags_e::noTotal, arg) \
-    macro (OMP_serial, stats_flags_e::onlyInMaster | stats_flags_e::noTotal, arg) \
-    macro (OMP_work, 0, arg) \
-    macro (OMP_barrier, 0, arg) \
-    macro (FOR_static_scheduling, 0, arg) \
-    macro (FOR_dynamic_scheduling, 0, arg) \
-    macro (OMP_task, 0, arg) \
-    macro (OMP_critical, 0, arg) \
-    macro (OMP_critical_wait, 0, arg) \
-    macro (OMP_single, 0, arg) \
-    macro (OMP_master, 0, arg) \
+#define KMP_FOREACH_TIMER(macro, arg) \
+    macro (OMP_worker_thread_life, 0, arg) \
+    macro (FOR_static_scheduling, 0, arg) \
+    macro (FOR_dynamic_scheduling, 0, arg) \
+    macro (OMP_critical, 0, arg) \
+    macro (OMP_critical_wait, 0, arg) \
+    macro (OMP_single, 0, arg) \
+    macro (OMP_master, 0, arg) \
+    macro (OMP_idle, 0, arg) \
+    macro (OMP_plain_barrier, 0, arg) \
+    macro (OMP_fork_join_barrier, 0, arg) \
+    macro (OMP_parallel, 0, arg) \
+    macro (OMP_task_immediate, 0, arg) \
+    macro (OMP_task_taskwait, 0, arg) \
+    macro (OMP_task_taskyield, 0, arg) \
+    macro (OMP_task_taskgroup, 0, arg) \
+    macro (OMP_task_join_bar, 0, arg) \
+    macro (OMP_task_plain_bar, 0, arg) \
+    macro (OMP_serial, 0, arg) \
     macro (OMP_set_numthreads, stats_flags_e::noUnits | stats_flags_e::noTotal, arg) \
     macro (OMP_PARALLEL_args, stats_flags_e::noUnits | stats_flags_e::noTotal, arg) \
     macro (FOR_static_iterations, stats_flags_e::noUnits | stats_flags_e::noTotal, arg) \
@@ -129,7 +154,16 @@
 // OMP_barrier -- Time at "real" barriers (includes task time)
 // FOR_static_scheduling -- Time spent doing scheduling for a static "for"
 // FOR_dynamic_scheduling -- Time spent doing scheduling for a dynamic "for"
-// OMP_task -- Time spent executing tasks
+// OMP_idle -- Worker threads' time spent waiting for inclusion in a parallel region
+// OMP_plain_barrier -- Time spent in a barrier construct
+// OMP_fork_join_barrier -- Time spent in the fork-join barrier surrounding a parallel region
+// OMP_parallel -- Time spent inside a parallel construct
+// OMP_task_immediate -- Time spent executing non-deferred tasks
+// OMP_task_taskwait -- Time spent executing tasks inside a taskwait construct
+// OMP_task_taskyield -- Time spent executing tasks inside a taskyield construct
+// OMP_task_taskgroup -- Time spent executing tasks inside a taskgroup construct
+// OMP_task_join_bar -- Time spent executing tasks inside a join barrier
+// OMP_task_plain_bar -- Time spent executing tasks inside a barrier construct
 // OMP_single -- Time spent executing a "single" region
 // OMP_master -- Time spent executing a "master" region
 // OMP_set_numthreads -- Values passed to omp_set_num_threads
@@ -197,12 +231,25 @@
  *
  * @ingroup STATS_GATHERING
  */
-#define KMP_FOREACH_EXPLICIT_TIMER(macro, arg) \
-    macro(OMP_serial, 0, arg) \
-    macro(OMP_start_end, 0, arg) \
-    macro(OMP_critical, 0, arg) \
-    macro(OMP_single, 0, arg) \
-    macro(OMP_master, 0, arg) \
+#define KMP_FOREACH_EXPLICIT_TIMER(macro, arg) \
+    macro(OMP_worker_thread_life, 0, arg) \
+    macro(FOR_static_scheduling, 0, arg) \
+    macro(FOR_dynamic_scheduling, 0, arg) \
+    macro(OMP_critical, 0, arg) \
+    macro(OMP_critical_wait, 0, arg) \
+    macro(OMP_single, 0, arg) \
+    macro(OMP_master, 0, arg) \
+    macro(OMP_idle, 0, arg) \
+    macro(OMP_plain_barrier, 0, arg) \
+    macro(OMP_fork_join_barrier, 0, arg) \
+    macro(OMP_parallel, 0, arg) \
+    macro(OMP_task_immediate, 0, arg) \
+    macro(OMP_task_taskwait, 0, arg) \
+    macro(OMP_task_taskyield, 0, arg) \
+    macro(OMP_task_taskgroup, 0, arg) \
+    macro(OMP_task_join_bar, 0, arg) \
+    macro(OMP_task_plain_bar, 0, arg) \
+    macro(OMP_serial, 0, arg) \
     KMP_FOREACH_EXPLICIT_DEVELOPER_TIMER(macro,arg) \
     macro(LAST, 0, arg)
@@ -227,6 +274,21 @@
 };
 #undef ENUMERATE
+class timerPair {
+    explicit_timer_e timer_index;
+    timer_e timer;
+ public:
+    timerPair(explicit_timer_e ti, timer_e t) : timer_index(ti), timer(t) {}
+    inline explicit_timer_e get_index() const { return timer_index; }
+    inline timer_e get_timer() const { return timer; }
+    bool operator==(const timerPair & rhs) {
+        return this->get_index() == rhs.get_index();
+    }
+    bool operator!=(const timerPair & rhs) {
+        return !(*this == rhs);
+    }
+};
+
 class statistic
 {
     double minVal;
@@ -294,15 +356,19 @@
 {
     timeStat * stat;
     tsc_tick_count startTime;
+    tsc_tick_count pauseStartTime;
+    tsc_tick_count::tsc_interval_t totalPauseTime;
 public:
-    explicitTimer () : stat(0), startTime(0) { }
-    explicitTimer (timeStat * s) : stat(s), startTime() { }
+    explicitTimer () : stat(0), startTime(0), pauseStartTime(0), totalPauseTime() { }
+    explicitTimer (timeStat * s) : stat(s), startTime(), pauseStartTime(0), totalPauseTime() { }
     void setStat (timeStat *s) { stat = s; }
     void start(timer_e timerEnumValue);
+    void pause() { pauseStartTime = tsc_tick_count::now(); }
+    void resume() { totalPauseTime += (tsc_tick_count::now() - pauseStartTime); }
     void stop(timer_e timerEnumValue);
-    void reset() { startTime = 0; }
+    void reset() { startTime = 0; pauseStartTime = 0; totalPauseTime = 0; }
 };
 // Where all you need is to time a block, this is enough.
@@ -315,6 +381,49 @@
     ~blockTimer() { stop(timerEnumValue); }
 };
+// Where you need to partition a thread's clock ticks into separate states
+// e.g., a partitionedTimers class with two timers of EXECUTING_TASK, and
+// DOING_NOTHING would render these conditions:
+// time(EXECUTING_TASK) + time(DOING_NOTHING) = total time thread is alive
+// No clock tick in the EXECUTING_TASK is a member of DOING_NOTHING and vice versa
+class partitionedTimers {
+ private:
+    explicitTimer* timers[EXPLICIT_TIMER_LAST+1];
+    std::vector<timerPair> timer_stack;
+ public:
+    partitionedTimers();
+    void add_timer(explicit_timer_e timer_index, explicitTimer* timer_pointer);
+    void init(timerPair timer_index);
+    void push(timerPair timer_index);
+    void pop();
+    void windup();
+};
+
+// Special wrapper around the partitioned timers to aid timing code blocks
+// It avoids the need to have an explicit end, leaving the scope suffices.
+class blockPartitionedTimer {
+    partitionedTimers* part_timers;
+    timerPair timer_pair;
+ public:
+    blockPartitionedTimer(partitionedTimers* pt, timerPair tp) : part_timers(pt), timer_pair(tp) { part_timers->push(timer_pair); }
+    ~blockPartitionedTimer() { part_timers->pop(); }
+};
+
+// Special wrapper around the thread state to aid in keeping state in code blocks
+// It avoids the need to have an explicit end, leaving the scope suffices.
+class blockThreadState {
+    stats_state_e* state_pointer;
+    stats_state_e old_state;
+ public:
+    blockThreadState(stats_state_e* thread_state_pointer, stats_state_e new_state) : state_pointer(thread_state_pointer), old_state(*thread_state_pointer) {
+        *state_pointer = new_state;
+    }
+    ~blockThreadState() { *state_pointer = old_state; }
+};
+
 // If all you want is a count, then you can use this...
 // The individual per-thread counts will be aggregated into a statistic at program exit.
 class counter
@@ -473,14 +582,19 @@
     timeStat      _timers[TIMER_LAST+1];
     counter       _counters[COUNTER_LAST+1];
     explicitTimer _explicitTimers[EXPLICIT_TIMER_LAST+1];
+    partitionedTimers _partitionedTimers;
     int           _nestLevel; // one per thread
     kmp_stats_event_vector _event_vector;
     kmp_stats_list* next;
     kmp_stats_list* prev;
+    stats_state_e state;
+    int thread_is_idle_flag;
 public:
-    kmp_stats_list() : next(this) , prev(this) , _event_vector(), _nestLevel(0) {
+    kmp_stats_list() : _nestLevel(0), _event_vector(), next(this), prev(this),
+                       state(IDLE), thread_is_idle_flag(0) {
 #define doInit(name,ignore1,ignore2) \
-    getExplicitTimer(EXPLICIT_TIMER_##name)->setStat(getTimer(TIMER_##name));
+    getExplicitTimer(EXPLICIT_TIMER_##name)->setStat(getTimer(TIMER_##name)); \
+    _partitionedTimers.add_timer(EXPLICIT_TIMER_##name, getExplicitTimer(EXPLICIT_TIMER_##name));
         KMP_FOREACH_EXPLICIT_TIMER(doInit,0);
 #undef doInit
     }
@@ -488,6 +602,7 @@
     inline timeStat * getTimer(timer_e idx) { return &_timers[idx]; }
     inline counter * getCounter(counter_e idx) { return &_counters[idx]; }
     inline explicitTimer * getExplicitTimer(explicit_timer_e idx) { return &_explicitTimers[idx]; }
+    inline partitionedTimers * getPartitionedTimers() { return &_partitionedTimers; }
     inline timeStat * getTimers() { return _timers; }
     inline counter * getCounters() { return _counters; }
     inline explicitTimer * getExplicitTimers() { return _explicitTimers; }
@@ -498,6 +613,12 @@
     inline void decrementNestValue() { _nestLevel--; }
     inline int getGtid() const { return gtid; }
     inline void setGtid(int newgtid) { gtid = newgtid; }
+    inline void setState(stats_state_e newstate) { state = newstate; }
+    inline stats_state_e getState() const { return state; }
+    inline stats_state_e * getStatePointer() { return &state; }
+    inline bool isIdle() { return thread_is_idle_flag==1; }
+    inline void setIdleFlag() { thread_is_idle_flag = 1; }
+    inline void resetIdleFlag() { thread_is_idle_flag = 0; }
     kmp_stats_list* push_back(int gtid); // returns newly created list node
     inline void push_event(uint64_t start_time, uint64_t stop_time, int nest_level, timer_e name) {
         _event_vector.push_back(start_time, stop_time, nest_level, name);
@@ -699,6 +820,35 @@
     __kmp_output_stats(heading_string)
 /*!
+ * \brief Initializes the partitioned timers to begin with name.
+ *
+ * @param name timer which you want this thread to begin with
+ *
+ * @ingroup STATS_GATHERING
+*/
+#define KMP_INIT_PARTITIONED_TIMERS(name) \
+    __kmp_stats_thread_ptr->getPartitionedTimers()->init(timerPair(EXPLICIT_TIMER_##name, TIMER_##name))
+
+#define KMP_TIME_PARTITIONED_BLOCK(name) \
+    blockPartitionedTimer __PBLOCKTIME__(__kmp_stats_thread_ptr->getPartitionedTimers(), \
+        timerPair(EXPLICIT_TIMER_##name, TIMER_##name))
+
+#define KMP_PUSH_PARTITIONED_TIMER(name) \
+    __kmp_stats_thread_ptr->getPartitionedTimers()->push(timerPair(EXPLICIT_TIMER_##name, TIMER_##name))
+
+#define KMP_POP_PARTITIONED_TIMER() \
+    __kmp_stats_thread_ptr->getPartitionedTimers()->pop()
+
+#define KMP_SET_THREAD_STATE(state_name) \
+    __kmp_stats_thread_ptr->setState(state_name)
+
+#define KMP_GET_THREAD_STATE() \
+    __kmp_stats_thread_ptr->getState()
+
+#define KMP_SET_THREAD_STATE_BLOCK(state_name) \
+    blockThreadState __BTHREADSTATE__(__kmp_stats_thread_ptr->getStatePointer(), state_name)
+
+/*!
  * \brief resets all stats (counters to 0, timers to 0 elapsed ticks)
  *
  * \details Reset all stats for all threads.
@@ -739,6 +889,13 @@
 #define KMP_COUNT_DEVELOPER_BLOCK(n)            ((void)0)
 #define KMP_START_DEVELOPER_EXPLICIT_TIMER(n)   ((void)0)
 #define KMP_STOP_DEVELOPER_EXPLICIT_TIMER(n)    ((void)0)
+#define KMP_INIT_PARTITIONED_TIMERS(name)       ((void)0)
+#define KMP_TIME_PARTITIONED_BLOCK(name)        ((void)0)
+#define KMP_PUSH_PARTITIONED_TIMER(name)        ((void)0)
+#define KMP_POP_PARTITIONED_TIMER()             ((void)0)
+#define KMP_SET_THREAD_STATE(state_name)        ((void)0)
+#define KMP_GET_THREAD_STATE()                  ((void)0)
+#define KMP_SET_THREAD_STATE_BLOCK(state_name)  ((void)0)
 #endif // KMP_STATS_ENABLED
 #endif // KMP_STATS_H
Index: openmp/trunk/runtime/src/kmp_stats.cpp
===================================================================
--- openmp/trunk/runtime/src/kmp_stats.cpp
+++ openmp/trunk/runtime/src/kmp_stats.cpp
@@ -157,6 +157,7 @@
 void explicitTimer::start(timer_e timerEnumValue) {
     startTime = tsc_tick_count::now();
+    totalPauseTime = 0;
     if(timeStat::logEvent(timerEnumValue)) {
         __kmp_stats_thread_ptr->incrementNestValue();
     }
@@ -170,7 +171,7 @@
     tsc_tick_count finishTime = tsc_tick_count::now();
     //stat->addSample ((tsc_tick_count::now() - startTime).ticks());
-    stat->addSample ((finishTime - startTime).ticks());
+    stat->addSample(((finishTime - startTime) - totalPauseTime).ticks());
     if(timeStat::logEvent(timerEnumValue)) {
         __kmp_stats_thread_ptr->push_event(startTime.getValue() - __kmp_stats_start_time.getValue(), finishTime.getValue() - __kmp_stats_start_time.getValue(), __kmp_stats_thread_ptr->getNestValue(), timerEnumValue);
@@ -182,6 +183,74 @@
     return;
 }
+/* ************************************************************** */
+/* ************* partitionedTimers member functions ************* */
+partitionedTimers::partitionedTimers() {
+    timer_stack.reserve(8);
+}
+
+// add a timer to this collection of partitioned timers.
+void partitionedTimers::add_timer(explicit_timer_e timer_index, explicitTimer* timer_pointer) {
+    KMP_DEBUG_ASSERT((int)timer_index < (int)EXPLICIT_TIMER_LAST+1);
+    timers[timer_index] = timer_pointer;
+}
+
+// initialize the partitioned timers to an initial timer
+void partitionedTimers::init(timerPair init_timer_pair) {
+    KMP_DEBUG_ASSERT(this->timer_stack.size() == 0);
+    timer_stack.push_back(init_timer_pair);
+    timers[init_timer_pair.get_index()]->start(init_timer_pair.get_timer());
+}
+
+// stop/save the current timer, and start the new timer (timer_pair)
+// There is a special condition where if the current timer is equal to
+// the one you are trying to push, then it only manipulates the stack,
+// and it won't stop/start the currently running timer.
+void partitionedTimers::push(timerPair timer_pair) {
+    // get the current timer
+    // stop current timer
+    // push new timer
+    // start the new timer
+    KMP_DEBUG_ASSERT(this->timer_stack.size() > 0);
+    timerPair current_timer = timer_stack.back();
+    timer_stack.push_back(timer_pair);
+    if(current_timer != timer_pair) {
+        timers[current_timer.get_index()]->pause();
+        timers[timer_pair.get_index()]->start(timer_pair.get_timer());
+    }
+}
+
+// stop/discard the current timer, and start the previously saved timer
+void partitionedTimers::pop() {
+    // get the current timer
+    // stop current timer
+    // pop current timer
+    // get the new current timer and start it back up
+    KMP_DEBUG_ASSERT(this->timer_stack.size() > 1);
+    timerPair current_timer = timer_stack.back();
+    timer_stack.pop_back();
+    timerPair new_timer = timer_stack.back();
+    if(current_timer != new_timer) {
+        timers[current_timer.get_index()]->stop(current_timer.get_timer());
+        timers[new_timer.get_index()]->resume();
+    }
+}
+
+// Wind up all the currently running timers.
+// This pops off all the timers from the stack and clears the stack
+// After this is called, init() must be run again to initialize the
+// stack of timers
+void partitionedTimers::windup() {
+    while(timer_stack.size() > 1) {
+        this->pop();
+    }
+    if(timer_stack.size() > 0) {
+        timerPair last_timer = timer_stack.back();
+        timer_stack.pop_back();
+        timers[last_timer.get_index()]->stop(last_timer.get_timer());
+    }
+}
+
 /* ******************************************************************* */
 /* ************* kmp_stats_event_vector member functions ************* */
@@ -397,8 +466,10 @@
     // If the timer wasn't running, this won't record anything anyway.
     kmp_stats_list::iterator it;
     for(it = __kmp_stats_list.begin(); it != __kmp_stats_list.end(); it++) {
+        kmp_stats_list* ptr = *it;
+        ptr->getPartitionedTimers()->windup();
         for (int timer=0; timer<EXPLICIT_TIMER_LAST; timer++) {
-            (*it)->getExplicitTimer(explicit_timer_e(timer))->stop((timer_e)timer);
+            ptr->getExplicitTimer(explicit_timer_e(timer))->stop((timer_e)timer);
         }
     }
 }
@@ -595,11 +666,7 @@
         // reset the event vector so all previous events are "erased"
         (*it)->resetEventVector();
-
-        // May need to restart the explicit timers in thread zero?
     }
-    KMP_START_EXPLICIT_TIMER(OMP_serial);
-    KMP_START_EXPLICIT_TIMER(OMP_start_end);
 }
 // This function will reset all stats and stop all threads' explicit timers if they haven't been stopped already.
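For readers new to this code, the following stand-alone sketch (hypothetical names; std::chrono in place of the runtime's tsc_tick_count) illustrates what partitionedTimers::push()/pop()/windup() above are doing: a per-thread stack of mutually exclusive partitions, where pushing pauses the current partition and starts a new one, and popping stops the current one and resumes the previous, so the per-partition totals sum to the thread's lifetime. It is a simplified model, not the kmp_stats implementation.

// Stand-alone model of the partitioned-timer idea; names are hypothetical.
#include <chrono>
#include <iostream>
#include <map>
#include <string>
#include <thread>
#include <vector>

using Clock = std::chrono::steady_clock;

// One timer per partition; pause()/resume() subtracts time spent in nested
// partitions, analogous to explicitTimer::pause()/resume() in the patch.
class SimpleTimer {
    Clock::time_point start_{};
    Clock::time_point pauseStart_{};
    Clock::duration paused_{};
public:
    Clock::duration total{};
    void start()  { start_ = Clock::now(); paused_ = Clock::duration::zero(); }
    void pause()  { pauseStart_ = Clock::now(); }
    void resume() { paused_ += Clock::now() - pauseStart_; }
    void stop()   { total += (Clock::now() - start_) - paused_; }
};

// A stack of partitions: at any instant exactly one timer accumulates time.
class PartitionedTimers {
    std::map<std::string, SimpleTimer> timers_;
    std::vector<std::string> stack_;
public:
    void init(const std::string& name) { stack_.push_back(name); timers_[name].start(); }
    void push(const std::string& name) {
        const std::string current = stack_.back();
        stack_.push_back(name);
        if (current != name) {          // same-timer guard, as in the patch's push()
            timers_[current].pause();
            timers_[name].start();
        }
    }
    void pop() {
        const std::string current = stack_.back();
        stack_.pop_back();
        if (current != stack_.back()) { // mirror of the patch's pop()
            timers_[current].stop();
            timers_[stack_.back()].resume();
        }
    }
    void windup() {                     // flush everything at shutdown
        while (stack_.size() > 1) pop();
        timers_[stack_.back()].stop();
        stack_.pop_back();
    }
    void report() const {
        for (const auto& kv : timers_)
            std::cout << kv.first << ": "
                      << std::chrono::duration_cast<std::chrono::milliseconds>(kv.second.total).count()
                      << " ms\n";
    }
};

int main() {
    PartitionedTimers pt;
    pt.init("idle");              // cf. KMP_INIT_PARTITIONED_TIMERS(OMP_idle)
    pt.push("parallel");          // cf. KMP_PUSH_PARTITIONED_TIMER(OMP_parallel)
    std::this_thread::sleep_for(std::chrono::milliseconds(20));
    pt.push("plain_barrier");     // nested partition pauses "parallel"
    std::this_thread::sleep_for(std::chrono::milliseconds(10));
    pt.pop();
    pt.pop();
    pt.windup();
    pt.report();                  // idle + parallel + plain_barrier ~= total runtime
}

The same-name guard mirrors the check in the patch's push()/pop(), which avoids restarting a timer when the partition currently on top of the stack is pushed again.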
Index: openmp/trunk/runtime/src/kmp_stats_timing.h
===================================================================
--- openmp/trunk/runtime/src/kmp_stats_timing.h
+++ openmp/trunk/runtime/src/kmp_stats_timing.h
@@ -40,11 +40,16 @@
 #endif
         double ticks() const { return double(value); }
         int64_t getValue() const { return value; }
+        tsc_interval_t& operator=(int64_t nvalue) { value = nvalue; return *this; }
         friend class tsc_tick_count;
-        friend tsc_interval_t operator-(
-            const tsc_tick_count t1, const tsc_tick_count t0);
+        friend tsc_interval_t operator-(const tsc_tick_count& t1,
+                                        const tsc_tick_count& t0);
+        friend tsc_interval_t operator-(const tsc_tick_count::tsc_interval_t& i1,
+                                        const tsc_tick_count::tsc_interval_t& i0);
+        friend tsc_interval_t& operator+=(tsc_tick_count::tsc_interval_t& i1,
+                                          const tsc_tick_count::tsc_interval_t& i0);
     };
 #if KMP_HAVE___BUILTIN_READCYCLECOUNTER
@@ -66,14 +71,25 @@
     static double tick_time(); // returns seconds per cycle (period) of clock
 #endif
     static tsc_tick_count now() { return tsc_tick_count(); } // returns the rdtsc register value
-    friend tsc_tick_count::tsc_interval_t operator-(const tsc_tick_count t1, const tsc_tick_count t0);
+    friend tsc_tick_count::tsc_interval_t operator-(const tsc_tick_count& t1, const tsc_tick_count& t0);
 };
-inline tsc_tick_count::tsc_interval_t operator-(const tsc_tick_count t1, const tsc_tick_count t0)
+inline tsc_tick_count::tsc_interval_t operator-(const tsc_tick_count& t1, const tsc_tick_count& t0)
 {
     return tsc_tick_count::tsc_interval_t( t1.my_count-t0.my_count );
 }
+inline tsc_tick_count::tsc_interval_t operator-(const tsc_tick_count::tsc_interval_t& i1, const tsc_tick_count::tsc_interval_t& i0)
+{
+    return tsc_tick_count::tsc_interval_t( i1.value-i0.value );
+}
+
+inline tsc_tick_count::tsc_interval_t& operator+=(tsc_tick_count::tsc_interval_t& i1, const tsc_tick_count::tsc_interval_t& i0)
+{
+    i1.value += i0.value;
+    return i1;
+}
+
 #if KMP_HAVE_TICK_TIME
 inline double tsc_tick_count::tsc_interval_t::seconds() const
 {
Index: openmp/trunk/runtime/src/kmp_tasking.c
===================================================================
--- openmp/trunk/runtime/src/kmp_tasking.c
+++ openmp/trunk/runtime/src/kmp_tasking.c
@@ -36,16 +36,6 @@
 static void __kmp_bottom_half_finish_proxy( kmp_int32 gtid, kmp_task_t * ptask );
 #endif
-static inline void __kmp_null_resume_wrapper(int gtid, volatile void *flag) {
-    if (!flag) return;
-    // Attempt to wake up a thread: examine its type and call appropriate template
-    switch (((kmp_flag_64 *)flag)->get_type()) {
-        case flag32: __kmp_resume_32(gtid, NULL); break;
-        case flag64: __kmp_resume_64(gtid, NULL); break;
-        case flag_oncore: __kmp_resume_oncore(gtid, NULL); break;
-    }
-}
-
 #ifdef BUILD_TIED_TASK_STACK
 //---------------------------------------------------------------------------
@@ -1207,8 +1197,17 @@
     // Thunks generated by gcc take a different argument list.
     //
     if (!discard) {
+#if KMP_STATS_ENABLED
         KMP_COUNT_BLOCK(TASK_executed);
-        KMP_TIME_BLOCK (OMP_task);
+        switch(KMP_GET_THREAD_STATE()) {
+            case FORK_JOIN_BARRIER: KMP_PUSH_PARTITIONED_TIMER(OMP_task_join_bar); break;
+            case PLAIN_BARRIER:     KMP_PUSH_PARTITIONED_TIMER(OMP_task_plain_bar); break;
+            case TASKYIELD:         KMP_PUSH_PARTITIONED_TIMER(OMP_task_taskyield); break;
+            case TASKWAIT:          KMP_PUSH_PARTITIONED_TIMER(OMP_task_taskwait); break;
+            case TASKGROUP:         KMP_PUSH_PARTITIONED_TIMER(OMP_task_taskgroup); break;
+            default:                KMP_PUSH_PARTITIONED_TIMER(OMP_task_immediate); break;
+        }
+#endif // KMP_STATS_ENABLED
 #endif // OMP_40_ENABLED
 #if OMPT_SUPPORT && OMPT_TRACE
@@ -1231,6 +1230,7 @@
         {
             (*(task->routine))(gtid, task);
         }
+        KMP_POP_PARTITIONED_TIMER();
 #if OMPT_SUPPORT && OMPT_TRACE
         /* let OMPT know that we're returning to the callee task */
@@ -1369,6 +1369,7 @@
 __kmpc_omp_task( ident_t *loc_ref, kmp_int32 gtid, kmp_task_t * new_task)
 {
     kmp_int32 res;
+    KMP_SET_THREAD_STATE_BLOCK(EXPLICIT_TASK);
 #if KMP_DEBUG
     kmp_taskdata_t * new_taskdata = KMP_TASK_TO_TASKDATA(new_task);
@@ -1392,6 +1393,7 @@
     kmp_taskdata_t * taskdata;
     kmp_info_t * thread;
     int thread_finished = FALSE;
+    KMP_SET_THREAD_STATE_BLOCK(TASKWAIT);
     KA_TRACE(10, ("__kmpc_omp_taskwait(enter): T#%d loc=%p\n", gtid, loc_ref) );
@@ -1481,6 +1483,7 @@
     int thread_finished = FALSE;
     KMP_COUNT_BLOCK(OMP_TASKYIELD);
+    KMP_SET_THREAD_STATE_BLOCK(TASKYIELD);
     KA_TRACE(10, ("__kmpc_omp_taskyield(enter): T#%d loc=%p end_part = %d\n", gtid, loc_ref, end_part) );
@@ -1561,6 +1564,7 @@
     KA_TRACE(10, ("__kmpc_end_taskgroup(enter): T#%d loc=%p\n", gtid, loc) );
     KMP_DEBUG_ASSERT( taskgroup != NULL );
+    KMP_SET_THREAD_STATE_BLOCK(TASKGROUP);
     if ( __kmp_tasking_mode != tskm_immediate_exec ) {
 #if USE_ITT_BUILD
Index: openmp/trunk/runtime/src/kmp_wait_release.h
===================================================================
--- openmp/trunk/runtime/src/kmp_wait_release.h
+++ openmp/trunk/runtime/src/kmp_wait_release.h
@@ -18,6 +18,7 @@
 #include "kmp.h"
 #include "kmp_itt.h"
+#include "kmp_stats.h"
 /*!
 @defgroup WAIT_RELEASE Wait/Release operations
@@ -104,6 +105,9 @@
     }
     th_gtid = this_thr->th.th_info.ds.ds_gtid;
     KA_TRACE(20, ("__kmp_wait_sleep: T#%d waiting for flag(%p)\n", th_gtid, flag));
+#if KMP_STATS_ENABLED
+    stats_state_e thread_state = KMP_GET_THREAD_STATE();
+#endif
 #if OMPT_SUPPORT && OMPT_BLAME
     ompt_state_t ompt_state = this_thr->th.ompt_thread_info.state;
@@ -223,6 +227,15 @@
             }
         }
+#if KMP_STATS_ENABLED
+        // Check if thread has been signalled to idle state
+        // This indicates that the logical "join-barrier" has finished
+        if (this_thr->th.th_stats->isIdle() && KMP_GET_THREAD_STATE() == FORK_JOIN_BARRIER) {
+            KMP_SET_THREAD_STATE(IDLE);
+            KMP_PUSH_PARTITIONED_TIMER(OMP_idle);
+        }
+#endif
         // Don't suspend if KMP_BLOCKTIME is set to "infinite"
         if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME)
             continue;
@@ -273,6 +286,14 @@
             }
         }
 #endif
+#if KMP_STATS_ENABLED
+        // If we were put into idle state, pop that off the state stack
+        if (KMP_GET_THREAD_STATE() == IDLE) {
+            KMP_POP_PARTITIONED_TIMER();
+            KMP_SET_THREAD_STATE(thread_state);
+            this_thr->th.th_stats->resetIdleFlag();
+        }
+#endif
     KMP_FSYNC_SPIN_ACQUIRED(spin);
 }
@@ -556,6 +577,15 @@
     flag_type get_ptr_type() { return flag_oncore; }
 };
+// Used to wake up threads, volatile void* flag is usually the th_sleep_loc associated
+// with int gtid.
+static inline void __kmp_null_resume_wrapper(int gtid, volatile void *flag) {
+    switch (((kmp_flag_64 *)flag)->get_type()) {
+        case flag32: __kmp_resume_32(gtid, NULL); break;
+        case flag64: __kmp_resume_64(gtid, NULL); break;
+        case flag_oncore: __kmp_resume_oncore(gtid, NULL); break;
+    }
+}
 /*!
 @}
Index: openmp/trunk/runtime/src/z_Linux_util.c
===================================================================
--- openmp/trunk/runtime/src/z_Linux_util.c
+++ openmp/trunk/runtime/src/z_Linux_util.c
@@ -697,6 +697,9 @@
 #if KMP_STATS_ENABLED
     // set __thread local index to point to thread-specific stats
     __kmp_stats_thread_ptr = ((kmp_info_t*)thr)->th.th_stats;
+    KMP_START_EXPLICIT_TIMER(OMP_worker_thread_life);
+    KMP_SET_THREAD_STATE(IDLE);
+    KMP_INIT_PARTITIONED_TIMERS(OMP_idle);
 #endif
 #if USE_ITT_BUILD
@@ -972,8 +975,9 @@
         __kmp_stats_start_time = tsc_tick_count::now();
         __kmp_stats_thread_ptr = th->th.th_stats;
         __kmp_stats_init();
-        KMP_START_EXPLICIT_TIMER(OMP_serial);
-        KMP_START_EXPLICIT_TIMER(OMP_start_end);
+        KMP_START_EXPLICIT_TIMER(OMP_worker_thread_life);
+        KMP_SET_THREAD_STATE(SERIAL_REGION);
+        KMP_INIT_PARTITIONED_TIMERS(OMP_serial);
     }
     __kmp_release_tas_lock(&__kmp_stats_lock, gtid);
@@ -1856,6 +1860,7 @@
 void
 __kmp_resume_monitor()
 {
+    KMP_TIME_DEVELOPER_BLOCK(USER_resume);
     int status;
 #ifdef KMP_DEBUG
     int gtid = TCR_4(__kmp_init_gtid) ? __kmp_get_gtid() : -1;
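A usage note on the new macros: KMP_TIME_PARTITIONED_BLOCK and KMP_SET_THREAD_STATE_BLOCK expand to scope guards (blockPartitionedTimer, blockThreadState), so a single macro at the top of a block records the new timer or state and automatically restores the previous one on every exit path, and the state set this way is what the switch added in kmp_tasking.c consults to attribute task execution time to the enclosing construct. The sketch below shows the two pieces together with hypothetical names; it illustrates the idiom and is not runtime code.

// Stand-alone illustration of the scope-guard + state-driven timer selection.
#include <cassert>
#include <iostream>

// Hypothetical mirror of stats_state_e: the construct a thread is currently in.
enum class ThreadState { IDLE, SERIAL, FORK_JOIN_BARRIER, PLAIN_BARRIER, TASKWAIT };

thread_local ThreadState g_state = ThreadState::IDLE;

// Analogue of blockThreadState: set a new state for the lifetime of a scope,
// restore the previous one on every exit path (return, exception, fall-through).
class ScopedThreadState {
    ThreadState saved_;
public:
    explicit ScopedThreadState(ThreadState s) : saved_(g_state) { g_state = s; }
    ~ScopedThreadState() { g_state = saved_; }
};

// Analogue of the switch added in kmp_tasking.c: pick the timer bucket for a
// task based on the construct the executing thread is currently inside.
const char* task_timer_for(ThreadState s) {
    switch (s) {
    case ThreadState::FORK_JOIN_BARRIER: return "OMP_task_join_bar";
    case ThreadState::PLAIN_BARRIER:     return "OMP_task_plain_bar";
    case ThreadState::TASKWAIT:          return "OMP_task_taskwait";
    default:                             return "OMP_task_immediate";
    }
}

void run_task() { std::cout << "task charged to " << task_timer_for(g_state) << "\n"; }

int main() {
    run_task();                                                    // -> OMP_task_immediate
    {
        ScopedThreadState in_barrier(ThreadState::FORK_JOIN_BARRIER); // cf. KMP_SET_THREAD_STATE_BLOCK
        run_task();                                                // -> OMP_task_join_bar
    }
    assert(g_state == ThreadState::IDLE);                          // state restored when the guard left scope
    run_task();                                                    // -> OMP_task_immediate again
}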