Index: openmp/trunk/runtime/src/kmp_csupport.cpp =================================================================== --- openmp/trunk/runtime/src/kmp_csupport.cpp +++ openmp/trunk/runtime/src/kmp_csupport.cpp @@ -262,6 +262,14 @@ int gtid = __kmp_entry_gtid(); #if (KMP_STATS_ENABLED) + // If we were in a serial region, then stop the serial timer, record + // the event, and start parallel region timer + stats_state_e previous_state = KMP_GET_THREAD_STATE(); + if (previous_state == stats_state_e::SERIAL_REGION) { + KMP_EXCHANGE_PARTITIONED_TIMER(OMP_parallel_overhead); + } else { + KMP_PUSH_PARTITIONED_TIMER(OMP_parallel_overhead); + } int inParallel = __kmpc_in_parallel(loc); if (inParallel) { KMP_COUNT_BLOCK(OMP_NESTED_PARALLEL); @@ -318,6 +326,14 @@ va_end(ap); } + +#if KMP_STATS_ENABLED + if (previous_state == stats_state_e::SERIAL_REGION) { + KMP_EXCHANGE_PARTITIONED_TIMER(OMP_serial); + } else { + KMP_POP_PARTITIONED_TIMER(); + } +#endif // KMP_STATS_ENABLED } #if OMP_40_ENABLED @@ -1115,8 +1131,6 @@ __kmpc_critical_with_hint(loc, global_tid, crit, omp_lock_hint_none); #else KMP_COUNT_BLOCK(OMP_CRITICAL); - KMP_TIME_PARTITIONED_BLOCK( - OMP_critical_wait); /* Time spent waiting to enter the critical section */ #if OMPT_SUPPORT && OMPT_OPTIONAL omp_state_t prev_state = omp_state_undefined; ompt_thread_info_t ti; @@ -1127,6 +1141,7 @@ // TODO: add THR_OVHD_STATE + KMP_PUSH_PARTITIONED_TIMER(OMP_critical_wait); KMP_CHECK_USER_LOCK_INIT(); if ((__kmp_user_lock_kind == lk_tas) && @@ -1193,8 +1208,9 @@ } } #endif + KMP_POP_PARTITIONED_TIMER(); - KMP_START_EXPLICIT_TIMER(OMP_critical); + KMP_PUSH_PARTITIONED_TIMER(OMP_critical); KA_TRACE(15, ("__kmpc_critical: done T#%d\n", global_tid)); #endif // KMP_USE_DYNAMIC_LOCK } @@ -1345,6 +1361,7 @@ kmp_dyna_lock_t *lk = (kmp_dyna_lock_t *)crit; // Check if it is initialized. + KMP_PUSH_PARTITIONED_TIMER(OMP_critical_wait); if (*lk == 0) { kmp_dyna_lockseq_t lckseq = __kmp_map_hint_to_lock(hint); if (KMP_IS_D_LOCK(lckseq)) { @@ -1422,6 +1439,7 @@ #endif KMP_I_LOCK_FUNC(ilk, set)(lck, global_tid); } + KMP_POP_PARTITIONED_TIMER(); #if USE_ITT_BUILD __kmp_itt_critical_acquired(lck); @@ -1753,6 +1771,7 @@ Mark the end of a statically scheduled loop. */ void __kmpc_for_static_fini(ident_t *loc, kmp_int32 global_tid) { + KMP_POP_PARTITIONED_TIMER(); KE_TRACE(10, ("__kmpc_for_static_fini called T#%d\n", global_tid)); #if OMPT_SUPPORT && OMPT_OPTIONAL @@ -1779,7 +1798,6 @@ &(task_info->task_data), 0, OMPT_GET_RETURN_ADDRESS(0)); } #endif - if (__kmp_env_consistency_check) __kmp_pop_workshare(global_tid, ct_pdo, loc); } Index: openmp/trunk/runtime/src/kmp_dispatch.cpp =================================================================== --- openmp/trunk/runtime/src/kmp_dispatch.cpp +++ openmp/trunk/runtime/src/kmp_dispatch.cpp @@ -321,7 +321,7 @@ ntc = (tc % chunk ? 1 : 0) + tc / chunk; if (nproc > 1 && ntc >= nproc) { - KMP_COUNT_BLOCK(OMP_FOR_static_steal); + KMP_COUNT_BLOCK(OMP_LOOP_STATIC_STEAL); T id = tid; T small_chunk, extras; @@ -770,6 +770,15 @@ active = !team->t.t_serialized; th->th.th_ident = loc; + // Any half-decent optimizer will remove this test when the blocks are empty + // since the macros expand to nothing + // when statistics are disabled. + if (schedule == __kmp_static) { + KMP_COUNT_BLOCK(OMP_LOOP_STATIC); + } else { + KMP_COUNT_BLOCK(OMP_LOOP_DYNAMIC); + } + #if KMP_USE_HIER_SCHED // Initialize the scheduling hierarchy if requested in OMP_SCHEDULE envirable // Hierarchical scheduling does not work with ordered, so if ordered is @@ -846,17 +855,6 @@ } } - // Any half-decent optimizer will remove this test when the blocks are empty - // since the macros expand to nothing - // when statistics are disabled. - if (schedule == __kmp_static) { - KMP_COUNT_BLOCK(OMP_FOR_static); - KMP_COUNT_VALUE(FOR_static_iterations, pr->u.p.tc); - } else { - KMP_COUNT_BLOCK(OMP_FOR_dynamic); - KMP_COUNT_VALUE(FOR_dynamic_iterations, pr->u.p.tc); - } - if (active) { /* The name of this buffer should be my_buffer_index when it's free to use * it */ @@ -962,6 +960,7 @@ &(task_info->task_data), pr->u.p.tc, OMPT_LOAD_RETURN_ADDRESS(gtid)); } #endif + KMP_PUSH_PARTITIONED_TIMER(OMP_loop_dynamic); } /* For ordered loops, either __kmp_dispatch_finish() should be called after @@ -1229,11 +1228,11 @@ // by 1 if (remaining > 3) { // steal 1/4 of remaining - KMP_COUNT_VALUE(FOR_static_steal_stolen, remaining >> 2); + KMP_COUNT_DEVELOPER_VALUE(FOR_static_steal_stolen, remaining >> 2); init = (victim->u.p.ub -= (remaining >> 2)); } else { // steal 1 chunk of 2 or 3 remaining - KMP_COUNT_VALUE(FOR_static_steal_stolen, 1); + KMP_COUNT_DEVELOPER_VALUE(FOR_static_steal_stolen, 1); init = (victim->u.p.ub -= 1); } __kmp_release_lock(lck, gtid); @@ -1333,7 +1332,8 @@ *VOLATILE_CAST(kmp_int64 *) & vold.b, *VOLATILE_CAST(kmp_int64 *) & vnew.b)) { // stealing succedded - KMP_COUNT_VALUE(FOR_static_steal_stolen, vold.p.ub - vnew.p.ub); + KMP_COUNT_DEVELOPER_VALUE(FOR_static_steal_stolen, + vold.p.ub - vnew.p.ub); status = 1; while_index = 0; // now update own count and ub @@ -1361,7 +1361,7 @@ init *= chunk; limit = chunk + init - 1; incr = pr->u.p.st; - KMP_COUNT_VALUE(FOR_static_steal_chunks, 1); + KMP_COUNT_DEVELOPER_VALUE(FOR_static_steal_chunks, 1); KMP_DEBUG_ASSERT(init <= trip); if ((last = (limit >= trip)) != 0) @@ -1823,6 +1823,38 @@ #define OMPT_LOOP_END // no-op #endif +#if KMP_STATS_ENABLED +#define KMP_STATS_LOOP_END \ + { \ + kmp_int64 u, l, t, i; \ + l = (kmp_int64)(*p_lb); \ + u = (kmp_int64)(*p_ub); \ + i = (kmp_int64)(pr->u.p.st); \ + if (status == 0) { \ + t = 0; \ + KMP_POP_PARTITIONED_TIMER(); \ + } else if (i == 1) { \ + if (u >= l) \ + t = u - l + 1; \ + else \ + t = 0; \ + } else if (i < 0) { \ + if (l >= u) \ + t = (l - u) / (-i) + 1; \ + else \ + t = 0; \ + } else { \ + if (u >= l) \ + t = (u - l) / i + 1; \ + else \ + t = 0; \ + } \ + KMP_COUNT_VALUE(OMP_loop_dynamic_iterations, t); \ + } +#else +#define KMP_STATS_LOOP_END /* Nothing */ +#endif + template static int __kmp_dispatch_next(ident_t *loc, int gtid, kmp_int32 *p_last, T *p_lb, T *p_ub, @@ -1840,7 +1872,7 @@ // even if the actual runtme schedule is static. (Which points out a // disadavantage of schedule(runtime): even when static scheduling is used it // costs more than a compile time choice to use static scheduling would.) - KMP_TIME_PARTITIONED_BLOCK(FOR_dynamic_scheduling); + KMP_TIME_PARTITIONED_BLOCK(OMP_loop_dynamic_scheduling); int status; dispatch_private_info_template *pr; @@ -1964,6 +1996,7 @@ SSC_MARK_DISPATCH_NEXT(); #endif OMPT_LOOP_END; + KMP_STATS_LOOP_END; return status; } else { kmp_int32 last = 0; @@ -2081,6 +2114,7 @@ SSC_MARK_DISPATCH_NEXT(); #endif OMPT_LOOP_END; + KMP_STATS_LOOP_END; return status; } Index: openmp/trunk/runtime/src/kmp_runtime.cpp =================================================================== --- openmp/trunk/runtime/src/kmp_runtime.cpp +++ openmp/trunk/runtime/src/kmp_runtime.cpp @@ -1644,12 +1644,8 @@ KA_TRACE(20, ("__kmp_fork_call: T#%d(%d:0) invoke microtask = %p\n", gtid, parent_team->t.t_id, parent_team->t.t_pkfn)); - { - KMP_TIME_PARTITIONED_BLOCK(OMP_parallel); - KMP_SET_THREAD_STATE_BLOCK(IMPLICIT_TASK); - if (!parent_team->t.t_invoke(gtid)) { - KMP_ASSERT2(0, "cannot invoke microtask for MASTER thread"); - } + if (!parent_team->t.t_invoke(gtid)) { + KMP_ASSERT2(0, "cannot invoke microtask for MASTER thread"); } KA_TRACE(20, ("__kmp_fork_call: T#%d(%d:0) done microtask = %p\n", gtid, parent_team->t.t_id, parent_team->t.t_pkfn)); @@ -1839,11 +1835,7 @@ // because initial code in teams should have level=0 team->t.t_level--; // AC: call special invoker for outer "parallel" of teams construct - { - KMP_TIME_PARTITIONED_BLOCK(OMP_parallel); - KMP_SET_THREAD_STATE_BLOCK(IMPLICIT_TASK); - invoker(gtid); - } + invoker(gtid); } else { #endif /* OMP_40_ENABLED */ argv = args; @@ -2256,12 +2248,8 @@ team->t.t_id, team->t.t_pkfn)); } // END of timer KMP_fork_call block - { - KMP_TIME_PARTITIONED_BLOCK(OMP_parallel); - KMP_SET_THREAD_STATE_BLOCK(IMPLICIT_TASK); - if (!team->t.t_invoke(gtid)) { - KMP_ASSERT2(0, "cannot invoke microtask for MASTER thread"); - } + if (!team->t.t_invoke(gtid)) { + KMP_ASSERT2(0, "cannot invoke microtask for MASTER thread"); } KA_TRACE(20, ("__kmp_fork_call: T#%d(%d:0) done microtask = %p\n", gtid, team->t.t_id, team->t.t_pkfn)); @@ -3700,7 +3688,7 @@ #if KMP_STATS_ENABLED // Initialize stats as soon as possible (right after gtid assignment). __kmp_stats_thread_ptr = __kmp_stats_list->push_back(gtid); - KMP_START_EXPLICIT_TIMER(OMP_worker_thread_life); + __kmp_stats_thread_ptr->startLife(); KMP_SET_THREAD_STATE(SERIAL_REGION); KMP_INIT_PARTITIONED_TIMERS(OMP_serial); #endif @@ -5639,11 +5627,7 @@ } #endif - { - KMP_TIME_PARTITIONED_BLOCK(OMP_parallel); - KMP_SET_THREAD_STATE_BLOCK(IMPLICIT_TASK); - rc = (*pteam)->t.t_invoke(gtid); - } + rc = (*pteam)->t.t_invoke(gtid); KMP_ASSERT(rc); KMP_MB(); Index: openmp/trunk/runtime/src/kmp_sched.cpp =================================================================== --- openmp/trunk/runtime/src/kmp_sched.cpp +++ openmp/trunk/runtime/src/kmp_sched.cpp @@ -51,8 +51,9 @@ void *codeptr #endif ) { - KMP_COUNT_BLOCK(OMP_FOR_static); - KMP_TIME_PARTITIONED_BLOCK(FOR_static_scheduling); + KMP_COUNT_BLOCK(OMP_LOOP_STATIC); + KMP_PUSH_PARTITIONED_TIMER(OMP_loop_static); + KMP_PUSH_PARTITIONED_TIMER(OMP_loop_static_scheduling); typedef typename traits_t::unsigned_t UT; typedef typename traits_t::signed_t ST; @@ -151,7 +152,6 @@ &(task_info->task_data), 0, codeptr); } #endif - KMP_COUNT_VALUE(FOR_static_iterations, 0); return; } @@ -254,7 +254,6 @@ loc); } } - KMP_COUNT_VALUE(FOR_static_iterations, trip_count); /* compute remaining parameters */ switch (schedtype) { @@ -390,6 +389,26 @@ } #endif +#if KMP_STATS_ENABLED + { + kmp_int64 t; + kmp_int64 u = (kmp_int64)(*pupper); + kmp_int64 l = (kmp_int64)(*plower); + kmp_int64 i = (kmp_int64)incr; + /* compute trip count */ + if (i == 1) { + t = u - l + 1; + } else if (i == -1) { + t = l - u + 1; + } else if (i > 0) { + t = (u - l) / i + 1; + } else { + t = (l - u) / (-i) + 1; + } + KMP_COUNT_VALUE(OMP_loop_static_iterations, t); + KMP_POP_PARTITIONED_TIMER(); + } +#endif return; } Index: openmp/trunk/runtime/src/kmp_stats.h =================================================================== --- openmp/trunk/runtime/src/kmp_stats.h +++ openmp/trunk/runtime/src/kmp_stats.h @@ -15,6 +15,7 @@ //===----------------------------------------------------------------------===// #include "kmp_config.h" +#include "kmp_debug.h" #if KMP_STATS_ENABLED /* Statistics accumulator. @@ -36,7 +37,10 @@ /* Enable developer statistics here if you want them. They are more detailed than is useful for application characterisation and are intended for the runtime library developer. */ -// #define KMP_DEVELOPER_STATS 1 +#define KMP_DEVELOPER_STATS 0 + +/* Enable/Disable histogram output */ +#define KMP_STATS_HIST 0 /*! * @ingroup STATS_GATHERING @@ -91,9 +95,9 @@ #define KMP_FOREACH_COUNTER(macro, arg) \ macro(OMP_PARALLEL,stats_flags_e::onlyInMaster|stats_flags_e::noTotal,arg) \ macro(OMP_NESTED_PARALLEL, 0, arg) \ - macro(OMP_FOR_static, 0, arg) \ - macro(OMP_FOR_static_steal, 0, arg) \ - macro(OMP_FOR_dynamic, 0, arg) \ + macro(OMP_LOOP_STATIC, 0, arg) \ + macro(OMP_LOOP_STATIC_STEAL, 0, arg) \ + macro(OMP_LOOP_DYNAMIC, 0, arg) \ macro(OMP_DISTRIBUTE, 0, arg) \ macro(OMP_BARRIER, 0, arg) \ macro(OMP_CRITICAL, 0, arg) \ @@ -119,7 +123,7 @@ * @param arg a user defined argument to send to the user defined macro * * \details A timer collects multiple samples of some count in each thread and - * then finally aggregates alll of the samples from all of the threads. For most + * then finally aggregates all of the samples from all of the threads. For most * timers the printing code also provides an aggregation over the thread totals. * These are printed as TOTAL_foo. The count is normally a time (in ticks), * hence the name "timer". (But can be any value, so we use this for "number of @@ -131,55 +135,59 @@ */ // clang-format off #define KMP_FOREACH_TIMER(macro, arg) \ - macro (OMP_worker_thread_life, stats_flags_e::logEvent, arg) \ - macro (FOR_static_scheduling, 0, arg) \ - macro (FOR_dynamic_scheduling, 0, arg) \ - macro (OMP_critical, 0, arg) \ - macro (OMP_critical_wait, 0, arg) \ - macro (OMP_single, 0, arg) \ - macro (OMP_master, 0, arg) \ - macro (OMP_idle, stats_flags_e::logEvent, arg) \ - macro (OMP_plain_barrier, stats_flags_e::logEvent, arg) \ - macro (OMP_fork_barrier, stats_flags_e::logEvent, arg) \ - macro (OMP_join_barrier, stats_flags_e::logEvent, arg) \ - macro (OMP_parallel, stats_flags_e::logEvent, arg) \ - macro (OMP_task_immediate, 0, arg) \ - macro (OMP_task_taskwait, 0, arg) \ - macro (OMP_task_taskyield, 0, arg) \ - macro (OMP_task_taskgroup, 0, arg) \ - macro (OMP_task_join_bar, 0, arg) \ - macro (OMP_task_plain_bar, 0, arg) \ - macro (OMP_serial, stats_flags_e::logEvent, arg) \ - macro (OMP_taskloop_scheduling, 0, arg) \ - macro (OMP_set_numthreads, stats_flags_e::noUnits | stats_flags_e::noTotal,\ - arg) \ - macro (OMP_PARALLEL_args, stats_flags_e::noUnits | stats_flags_e::noTotal, \ - arg) \ - macro (FOR_static_iterations, \ - stats_flags_e::noUnits | stats_flags_e::noTotal, arg) \ - macro (FOR_dynamic_iterations, \ - stats_flags_e::noUnits | stats_flags_e::noTotal, arg) \ - macro (FOR_static_steal_stolen, \ - stats_flags_e::noUnits | stats_flags_e::noTotal, arg) \ - macro (FOR_static_steal_chunks, \ - stats_flags_e::noUnits | stats_flags_e::noTotal, arg) \ - KMP_FOREACH_DEVELOPER_TIMER(macro, arg) + macro (OMP_worker_thread_life, stats_flags_e::logEvent, arg) \ + macro (OMP_parallel, stats_flags_e::logEvent, arg) \ + macro (OMP_parallel_overhead, stats_flags_e::logEvent, arg) \ + macro (OMP_loop_static, 0, arg) \ + macro (OMP_loop_static_scheduling, 0, arg) \ + macro (OMP_loop_dynamic, 0, arg) \ + macro (OMP_loop_dynamic_scheduling, 0, arg) \ + macro (OMP_critical, 0, arg) \ + macro (OMP_critical_wait, 0, arg) \ + macro (OMP_single, 0, arg) \ + macro (OMP_master, 0, arg) \ + macro (OMP_task_immediate, 0, arg) \ + macro (OMP_task_taskwait, 0, arg) \ + macro (OMP_task_taskyield, 0, arg) \ + macro (OMP_task_taskgroup, 0, arg) \ + macro (OMP_task_join_bar, 0, arg) \ + macro (OMP_task_plain_bar, 0, arg) \ + macro (OMP_taskloop_scheduling, 0, arg) \ + macro (OMP_plain_barrier, stats_flags_e::logEvent, arg) \ + macro (OMP_idle, stats_flags_e::logEvent, arg) \ + macro (OMP_fork_barrier, stats_flags_e::logEvent, arg) \ + macro (OMP_join_barrier, stats_flags_e::logEvent, arg) \ + macro (OMP_serial, stats_flags_e::logEvent, arg) \ + macro (OMP_set_numthreads, stats_flags_e::noUnits | stats_flags_e::noTotal, \ + arg) \ + macro (OMP_PARALLEL_args, stats_flags_e::noUnits | stats_flags_e::noTotal, \ + arg) \ + macro (OMP_loop_static_iterations, \ + stats_flags_e::noUnits | stats_flags_e::noTotal, arg) \ + macro (OMP_loop_dynamic_iterations, \ + stats_flags_e::noUnits | stats_flags_e::noTotal, arg) \ + KMP_FOREACH_DEVELOPER_TIMER(macro, arg) // clang-format on -// OMP_start_end -- Time from when OpenMP is initialized until the -// stats are printed at exit -// OMP_serial -- Thread zero time executing serial code -// OMP_work -- Elapsed time in code dispatched by a fork (measured -// in the thread) -// OMP_barrier -- Time at "real" barriers (includes task time) -// FOR_static_scheduling -- Time spent doing scheduling for a static "for" -// FOR_dynamic_scheduling -- Time spent doing scheduling for a dynamic "for" -// OMP_idle -- Worker threads time spent waiting for inclusion in -// a parallel region -// OMP_plain_barrier -- Time spent in a barrier construct -// OMP_fork_join_barrier -- Time spent in a the fork-join barrier surrounding a -// parallel region -// OMP_parallel -- Time spent inside a parallel construct +// OMP_worker_thread_life -- Time from thread becoming an OpenMP thread (either +// initializing OpenMP or being created by a master) +// until the thread is destroyed +// OMP_parallel -- Time thread spends executing work directly +// within a #pragma omp parallel +// OMP_parallel_overhead -- Time thread spends setting up a parallel region +// OMP_loop_static -- Time thread spends executing loop iterations from +// a statically scheduled loop +// OMP_loop_static_scheduling -- Time thread spends scheduling loop iterations +// from a statically scheduled loop +// OMP_loop_dynamic -- Time thread spends executing loop iterations from +// a dynamically scheduled loop +// OMP_loop_dynamic_scheduling -- Time thread spends scheduling loop iterations +// from a dynamically scheduled loop +// OMP_critical -- Time thread spends executing critical section +// OMP_critical_wait -- Time thread spends waiting to enter +// a critcal seciton +// OMP_single -- Time spent executing a "single" region +// OMP_master -- Time spent executing a "master" region // OMP_task_immediate -- Time spent executing non-deferred tasks // OMP_task_taskwait -- Time spent executing tasks inside a taskwait // construct @@ -190,17 +198,24 @@ // OMP_task_join_bar -- Time spent executing tasks inside a join barrier // OMP_task_plain_bar -- Time spent executing tasks inside a barrier // construct -// OMP_single -- Time spent executing a "single" region -// OMP_master -- Time spent executing a "master" region +// OMP_taskloop_scheduling -- Time spent scheduling tasks inside a taskloop +// construct +// OMP_plain_barrier -- Time spent in a #pragma omp barrier construct or +// inside implicit barrier at end of worksharing +// construct +// OMP_idle -- Time worker threads spend waiting for next +// parallel region +// OMP_fork_barrier -- Time spent in a the fork barrier surrounding a +// parallel region +// OMP_join_barrier -- Time spent in a the join barrier surrounding a +// parallel region +// OMP_serial -- Time thread zero spends executing serial code // OMP_set_numthreads -- Values passed to omp_set_num_threads // OMP_PARALLEL_args -- Number of arguments passed to a parallel region -// FOR_static_iterations -- Number of available parallel chunks of work in a -// static for -// FOR_dynamic_iterations -- Number of available parallel chunks of work in a -// dynamic for -// Both adjust for any chunking, so if there were an -// iteration count of 20 but a chunk size of 10, we'd -// record 2. +// OMP_loop_static_iterations -- Number of iterations thread is assigned for +// statically scheduled loops +// OMP_loop_dynamic_iterations -- Number of iterations thread is assigned for +// dynamically scheduled loops #if (KMP_DEVELOPER_STATS) // Timers which are of interest to runtime library developers, not end users. @@ -235,7 +250,11 @@ macro(USER_suspend, 0, arg) \ macro(KMP_allocate_team, 0, arg) \ macro(KMP_setup_icv_copy, 0, arg) \ - macro(USER_icv_copy, 0, arg) + macro(USER_icv_copy, 0, arg) \ + macro (FOR_static_steal_stolen, \ + stats_flags_e::noUnits | stats_flags_e::noTotal, arg) \ + macro (FOR_static_steal_chunks, \ + stats_flags_e::noUnits | stats_flags_e::noTotal, arg) #else #define KMP_FOREACH_DEVELOPER_TIMER(macro, arg) #endif @@ -272,53 +291,143 @@ enum counter_e { KMP_FOREACH_COUNTER(ENUMERATE, COUNTER_) COUNTER_LAST }; #undef ENUMERATE -class timerPair { - explicit_timer_e timer_index; - timer_e timer; +/* + * A logarithmic histogram. It accumulates the number of values in each power of + * ten bin. So 1<=x<10, 10<=x<100, ... + * Mostly useful where we have some big outliers and want to see information + * about them. + */ +class logHistogram { + enum { + numBins = 31, /* Number of powers of 10. If this changes you need to change + * the initializer for binMax */ + + /* + * If you want to use this to analyse values that may be less than 1, (for + * instance times in s), then the logOffset gives you negative powers. + * In our case here, we're just looking at times in ticks, or counts, so we + * can never see values with magnitude < 1 (other than zero), so we can set + * it to 0. As above change the initializer if you change this. + */ + logOffset = 0 + }; + uint32_t KMP_ALIGN_CACHE zeroCount; + struct { + uint32_t count; + double total; + } bins[numBins]; + + static double binMax[numBins]; + +#ifdef KMP_DEBUG + uint64_t _total; + + void check() const { + uint64_t t = zeroCount; + for (int i = 0; i < numBins; i++) + t += bins[i].count; + KMP_DEBUG_ASSERT(t == _total); + } +#else + void check() const {} +#endif public: - timerPair(explicit_timer_e ti, timer_e t) : timer_index(ti), timer(t) {} - inline explicit_timer_e get_index() const { return timer_index; } - inline timer_e get_timer() const { return timer; } - bool operator==(const timerPair &rhs) { - return this->get_index() == rhs.get_index(); + logHistogram() { reset(); } + + logHistogram(logHistogram const &o) { + for (int i = 0; i < numBins; i++) + bins[i] = o.bins[i]; +#ifdef KMP_DEBUG + _total = o._total; +#endif } - bool operator!=(const timerPair &rhs) { return !(*this == rhs); } + + void reset() { + zeroCount = 0; + for (int i = 0; i < numBins; i++) { + bins[i].count = 0; + bins[i].total = 0; + } + +#ifdef KMP_DEBUG + _total = 0; +#endif + } + uint32_t count(int b) const { return bins[b + logOffset].count; } + double total(int b) const { return bins[b + logOffset].total; } + static uint32_t findBin(double sample); + + logHistogram &operator+=(logHistogram const &o) { + zeroCount += o.zeroCount; + for (int i = 0; i < numBins; i++) { + bins[i].count += o.bins[i].count; + bins[i].total += o.bins[i].total; + } +#ifdef KMP_DEBUG + _total += o._total; + check(); +#endif + + return *this; + } + + void addSample(double sample); + int minBin() const; + int maxBin() const; + + std::string format(char) const; }; class statistic { - double minVal; + double KMP_ALIGN_CACHE minVal; double maxVal; double meanVal; double m2; uint64_t sampleCount; + double offset; + bool collectingHist; + logHistogram hist; public: - statistic() { reset(); } + statistic(bool doHist = bool(KMP_STATS_HIST)) { + reset(); + collectingHist = doHist; + } statistic(statistic const &o) : minVal(o.minVal), maxVal(o.maxVal), meanVal(o.meanVal), m2(o.m2), - sampleCount(o.sampleCount) {} - + sampleCount(o.sampleCount), offset(o.offset), + collectingHist(o.collectingHist), hist(o.hist) {} + statistic(double minv, double maxv, double meanv, uint64_t sc, double sd) + : minVal(minv), maxVal(maxv), meanVal(meanv), m2(sd * sd * sc), + sampleCount(sc), offset(0.0), collectingHist(false) {} + bool haveHist() const { return collectingHist; } double getMin() const { return minVal; } double getMean() const { return meanVal; } double getMax() const { return maxVal; } uint64_t getCount() const { return sampleCount; } double getSD() const { return sqrt(m2 / sampleCount); } double getTotal() const { return sampleCount * meanVal; } + logHistogram const *getHist() const { return &hist; } + void setOffset(double d) { offset = d; } void reset() { minVal = std::numeric_limits::max(); - maxVal = -std::numeric_limits::max(); + maxVal = -minVal; meanVal = 0.0; m2 = 0.0; sampleCount = 0; + offset = 0.0; + hist.reset(); } void addSample(double sample); void scale(double factor); void scaleDown(double f) { scale(1. / f); } + void forceCount(uint64_t count) { sampleCount = count; } statistic &operator+=(statistic const &other); std::string format(char unit, bool total = false) const; + std::string formatHist(char unit) const { return hist.format(unit); } }; struct statInfo { @@ -359,39 +468,29 @@ // to live on the stack of the thread, they're more work to use. class explicitTimer { timeStat *stat; + timer_e timerEnumValue; tsc_tick_count startTime; tsc_tick_count pauseStartTime; tsc_tick_count::tsc_interval_t totalPauseTime; public: - explicitTimer() - : stat(0), startTime(0), pauseStartTime(0), totalPauseTime() {} - explicitTimer(timeStat *s) - : stat(s), startTime(), pauseStartTime(0), totalPauseTime() {} - - void setStat(timeStat *s) { stat = s; } - void start(timer_e timerEnumValue); - void pause() { pauseStartTime = tsc_tick_count::now(); } - void resume() { totalPauseTime += (tsc_tick_count::now() - pauseStartTime); } - void stop(timer_e timerEnumValue, kmp_stats_list *stats_ptr = nullptr); + explicitTimer(timeStat *s, timer_e te) + : stat(s), timerEnumValue(te), startTime(), pauseStartTime(0), + totalPauseTime() {} + + // void setStat(timeStat *s) { stat = s; } + void start(tsc_tick_count tick); + void pause(tsc_tick_count tick) { pauseStartTime = tick; } + void resume(tsc_tick_count tick) { + totalPauseTime += (tick - pauseStartTime); + } + void stop(tsc_tick_count tick, kmp_stats_list *stats_ptr = nullptr); void reset() { startTime = 0; pauseStartTime = 0; totalPauseTime = 0; } -}; - -// Where all you need is to time a block, this is enough. -// (It avoids the need to have an explicit end, leaving the scope suffices.) -class blockTimer : public explicitTimer { - timer_e timerEnumValue; - -public: - blockTimer(timeStat *s, timer_e newTimerEnumValue) - : timerEnumValue(newTimerEnumValue), explicitTimer(s) { - start(timerEnumValue); - } - ~blockTimer() { stop(timerEnumValue); } + timer_e get_type() const { return timerEnumValue; } }; // Where you need to partition a threads clock ticks into separate states @@ -402,14 +501,13 @@ // versa class partitionedTimers { private: - explicitTimer *timers[EXPLICIT_TIMER_LAST + 1]; - std::vector timer_stack; + std::vector timer_stack; public: partitionedTimers(); - void add_timer(explicit_timer_e timer_index, explicitTimer *timer_pointer); - void init(timerPair timer_index); - void push(timerPair timer_index); + void init(explicitTimer timer); + void exchange(explicitTimer timer); + void push(explicitTimer timer); void pop(); void windup(); }; @@ -418,12 +516,11 @@ // It avoids the need to have an explicit end, leaving the scope suffices. class blockPartitionedTimer { partitionedTimers *part_timers; - timerPair timer_pair; public: - blockPartitionedTimer(partitionedTimers *pt, timerPair tp) - : part_timers(pt), timer_pair(tp) { - part_timers->push(timer_pair); + blockPartitionedTimer(partitionedTimers *pt, explicitTimer timer) + : part_timers(pt) { + part_timers->push(timer); } ~blockPartitionedTimer() { part_timers->pop(); } }; @@ -609,7 +706,7 @@ int gtid; timeStat _timers[TIMER_LAST + 1]; counter _counters[COUNTER_LAST + 1]; - explicitTimer _explicitTimers[EXPLICIT_TIMER_LAST + 1]; + explicitTimer thread_life_timer; partitionedTimers _partitionedTimers; int _nestLevel; // one per thread kmp_stats_event_vector _event_vector; @@ -620,28 +717,21 @@ public: kmp_stats_list() - : _nestLevel(0), _event_vector(), next(this), prev(this), state(IDLE), - thread_is_idle_flag(0) { -#define doInit(name, ignore1, ignore2) \ - getExplicitTimer(EXPLICIT_TIMER_##name)->setStat(getTimer(TIMER_##name)); \ - _partitionedTimers.add_timer(EXPLICIT_TIMER_##name, \ - getExplicitTimer(EXPLICIT_TIMER_##name)); - KMP_FOREACH_EXPLICIT_TIMER(doInit, 0); -#undef doInit - } + : thread_life_timer(&_timers[TIMER_OMP_worker_thread_life], + TIMER_OMP_worker_thread_life), + _nestLevel(0), _event_vector(), next(this), prev(this), state(IDLE), + thread_is_idle_flag(0) {} ~kmp_stats_list() {} inline timeStat *getTimer(timer_e idx) { return &_timers[idx]; } inline counter *getCounter(counter_e idx) { return &_counters[idx]; } - inline explicitTimer *getExplicitTimer(explicit_timer_e idx) { - return &_explicitTimers[idx]; - } inline partitionedTimers *getPartitionedTimers() { return &_partitionedTimers; } inline timeStat *getTimers() { return _timers; } inline counter *getCounters() { return _counters; } - inline explicitTimer *getExplicitTimers() { return _explicitTimers; } inline kmp_stats_event_vector &getEventVector() { return _event_vector; } + inline void startLife() { thread_life_timer.start(tsc_tick_count::now()); } + inline void endLife() { thread_life_timer.stop(tsc_tick_count::now(), this); } inline void resetEventVector() { _event_vector.reset(); } inline void incrementNestValue() { _nestLevel++; } inline int getNestValue() { return _nestLevel; } @@ -774,22 +864,6 @@ // Simple, standard interfaces that drop out completely if stats aren't enabled /*! - * \brief Uses specified timer (name) to time code block. - * - * @param name timer name as specified under the KMP_FOREACH_TIMER() macro - * - * \details Use KMP_TIME_BLOCK(name) macro to time a code block. This will - * record the time taken in the block and use the destructor to stop the timer. - * Convenient! With this definition you can't have more than one KMP_TIME_BLOCK - * in the same code block. I don't think that's a problem. - * - * @ingroup STATS_GATHERING -*/ -#define KMP_TIME_BLOCK(name) \ - blockTimer __BLOCKTIME__(__kmp_stats_thread_ptr->getTimer(TIMER_##name), \ - TIMER_##name) - -/*! * \brief Adds value to specified timer (name). * * @param name timer name as specified under the KMP_FOREACH_TIMER() macro @@ -817,44 +891,6 @@ __kmp_stats_thread_ptr->getCounter(COUNTER_##name)->increment() /*! - * \brief "Starts" an explicit timer which will need a corresponding - * KMP_STOP_EXPLICIT_TIMER() macro. - * - * @param name explicit timer name as specified under the - * KMP_FOREACH_EXPLICIT_TIMER() macro - * - * \details Use to start a timer. This will need a corresponding - * KMP_STOP_EXPLICIT_TIMER() macro to stop the timer unlike the - * KMP_TIME_BLOCK(name) macro which has an implicit stopping macro at the end - * of the code block. All explicit timers are stopped at library exit time - * before the final statistics are outputted. - * - * @ingroup STATS_GATHERING -*/ -#define KMP_START_EXPLICIT_TIMER(name) \ - __kmp_stats_thread_ptr->getExplicitTimer(EXPLICIT_TIMER_##name) \ - ->start(TIMER_##name) - -/*! - * \brief "Stops" an explicit timer. - * - * @param name explicit timer name as specified under the - * KMP_FOREACH_EXPLICIT_TIMER() macro - * - * \details Use KMP_STOP_EXPLICIT_TIMER(name) to stop a timer. When this is - * done, the time between the last KMP_START_EXPLICIT_TIMER(name) and this - * KMP_STOP_EXPLICIT_TIMER(name) will be added to the timer's stat value. The - * timer will then be reset. After the KMP_STOP_EXPLICIT_TIMER(name) macro is - * called, another call to KMP_START_EXPLICIT_TIMER(name) will start the timer - * once again. - * - * @ingroup STATS_GATHERING -*/ -#define KMP_STOP_EXPLICIT_TIMER(name) \ - __kmp_stats_thread_ptr->getExplicitTimer(EXPLICIT_TIMER_##name) \ - ->stop(TIMER_##name) - -/*! * \brief Outputs the current thread statistics and reset them. * * @param heading_string heading put above the final stats output @@ -881,21 +917,26 @@ * @ingroup STATS_GATHERING */ #define KMP_INIT_PARTITIONED_TIMERS(name) \ - __kmp_stats_thread_ptr->getPartitionedTimers()->init( \ - timerPair(EXPLICIT_TIMER_##name, TIMER_##name)) + __kmp_stats_thread_ptr->getPartitionedTimers()->init(explicitTimer( \ + __kmp_stats_thread_ptr->getTimer(TIMER_##name), TIMER_##name)) #define KMP_TIME_PARTITIONED_BLOCK(name) \ blockPartitionedTimer __PBLOCKTIME__( \ __kmp_stats_thread_ptr->getPartitionedTimers(), \ - timerPair(EXPLICIT_TIMER_##name, TIMER_##name)) + explicitTimer(__kmp_stats_thread_ptr->getTimer(TIMER_##name), \ + TIMER_##name)) #define KMP_PUSH_PARTITIONED_TIMER(name) \ - __kmp_stats_thread_ptr->getPartitionedTimers()->push( \ - timerPair(EXPLICIT_TIMER_##name, TIMER_##name)) + __kmp_stats_thread_ptr->getPartitionedTimers()->push(explicitTimer( \ + __kmp_stats_thread_ptr->getTimer(TIMER_##name), TIMER_##name)) #define KMP_POP_PARTITIONED_TIMER() \ __kmp_stats_thread_ptr->getPartitionedTimers()->pop() +#define KMP_EXCHANGE_PARTITIONED_TIMER(name) \ + __kmp_stats_thread_ptr->getPartitionedTimers()->exchange(explicitTimer( \ + __kmp_stats_thread_ptr->getTimer(TIMER_##name), TIMER_##name)) + #define KMP_SET_THREAD_STATE(state_name) \ __kmp_stats_thread_ptr->setState(state_name) Index: openmp/trunk/runtime/src/kmp_stats.cpp =================================================================== --- openmp/trunk/runtime/src/kmp_stats.cpp +++ openmp/trunk/runtime/src/kmp_stats.cpp @@ -21,6 +21,7 @@ #include #include #include // for atexit +#include #define STRINGIZE2(x) #x #define STRINGIZE(x) STRINGIZE2(x) @@ -67,9 +68,18 @@ // output interface static kmp_stats_output_module *__kmp_stats_global_output = NULL; +double logHistogram::binMax[] = { + 1.e1l, 1.e2l, 1.e3l, 1.e4l, 1.e5l, 1.e6l, 1.e7l, 1.e8l, + 1.e9l, 1.e10l, 1.e11l, 1.e12l, 1.e13l, 1.e14l, 1.e15l, 1.e16l, + 1.e17l, 1.e18l, 1.e19l, 1.e20l, 1.e21l, 1.e22l, 1.e23l, 1.e24l, + 1.e25l, 1.e26l, 1.e27l, 1.e28l, 1.e29l, 1.e30l}; + /* ************* statistic member functions ************* */ void statistic::addSample(double sample) { + sample -= offset; + KMP_DEBUG_ASSERT(std::isfinite(sample)); + double delta = sample - meanVal; sampleCount = sampleCount + 1; @@ -78,9 +88,14 @@ minVal = std::min(minVal, sample); maxVal = std::max(maxVal, sample); + if (collectingHist) + hist.addSample(sample); } statistic &statistic::operator+=(const statistic &other) { + if (other.sampleCount == 0) + return *this; + if (sampleCount == 0) { *this = other; return *this; @@ -104,6 +119,8 @@ minVal = std::min(minVal, other.minVal); maxVal = std::max(maxVal, other.maxVal); sampleCount = newSampleCount; + if (collectingHist) + hist += other.hist; return *this; } @@ -138,10 +155,89 @@ return result; } +/* ************* histogram member functions ************* */ + +// Lowest bin that has anything in it +int logHistogram::minBin() const { + for (int i = 0; i < numBins; i++) { + if (bins[i].count != 0) + return i - logOffset; + } + return -logOffset; +} + +// Highest bin that has anything in it +int logHistogram::maxBin() const { + for (int i = numBins - 1; i >= 0; i--) { + if (bins[i].count != 0) + return i - logOffset; + } + return -logOffset; +} + +// Which bin does this sample belong in ? +uint32_t logHistogram::findBin(double sample) { + double v = std::fabs(sample); + // Simply loop up looking which bin to put it in. + // According to a micro-architect this is likely to be faster than a binary + // search, since + // it will only have one branch mis-predict + for (int b = 0; b < numBins; b++) + if (binMax[b] > v) + return b; + fprintf(stderr, + "Trying to add a sample that is too large into a histogram\n"); + KMP_ASSERT(0); + return -1; +} + +void logHistogram::addSample(double sample) { + if (sample == 0.0) { + zeroCount += 1; +#ifdef KMP_DEBUG + _total++; + check(); +#endif + return; + } + KMP_DEBUG_ASSERT(std::isfinite(sample)); + uint32_t bin = findBin(sample); + KMP_DEBUG_ASSERT(0 <= bin && bin < numBins); + + bins[bin].count += 1; + bins[bin].total += sample; +#ifdef KMP_DEBUG + _total++; + check(); +#endif +} + +// This may not be the format we want, but it'll do for now +std::string logHistogram::format(char unit) const { + std::stringstream result; + + result << "Bin, Count, Total\n"; + if (zeroCount) { + result << "0, " << formatSI(zeroCount, 9, ' ') << ", ", + formatSI(0.0, 9, unit); + if (count(minBin()) == 0) + return result.str(); + result << "\n"; + } + for (int i = minBin(); i <= maxBin(); i++) { + result << "10**" << i << "<=v<10**" << (i + 1) << ", " + << formatSI(count(i), 9, ' ') << ", " << formatSI(total(i), 9, unit); + if (i != maxBin()) + result << "\n"; + } + + return result.str(); +} + /* ************* explicitTimer member functions ************* */ -void explicitTimer::start(timer_e timerEnumValue) { - startTime = tsc_tick_count::now(); +void explicitTimer::start(tsc_tick_count tick) { + startTime = tick; totalPauseTime = 0; if (timeStat::logEvent(timerEnumValue)) { __kmp_stats_thread_ptr->incrementNestValue(); @@ -149,22 +245,19 @@ return; } -void explicitTimer::stop(timer_e timerEnumValue, +void explicitTimer::stop(tsc_tick_count tick, kmp_stats_list *stats_ptr /* = nullptr */) { if (startTime.getValue() == 0) return; - tsc_tick_count finishTime = tsc_tick_count::now(); - - // stat->addSample ((tsc_tick_count::now() - startTime).ticks()); - stat->addSample(((finishTime - startTime) - totalPauseTime).ticks()); + stat->addSample(((tick - startTime) - totalPauseTime).ticks()); if (timeStat::logEvent(timerEnumValue)) { if (!stats_ptr) stats_ptr = __kmp_stats_thread_ptr; stats_ptr->push_event( startTime.getValue() - __kmp_stats_start_time.getValue(), - finishTime.getValue() - __kmp_stats_start_time.getValue(), + tick.getValue() - __kmp_stats_start_time.getValue(), __kmp_stats_thread_ptr->getNestValue(), timerEnumValue); stats_ptr->decrementNestValue(); } @@ -178,52 +271,67 @@ /* ************* partitionedTimers member functions ************* */ partitionedTimers::partitionedTimers() { timer_stack.reserve(8); } -// add a timer to this collection of partitioned timers. -void partitionedTimers::add_timer(explicit_timer_e timer_index, - explicitTimer *timer_pointer) { - KMP_DEBUG_ASSERT((int)timer_index < (int)EXPLICIT_TIMER_LAST + 1); - timers[timer_index] = timer_pointer; -} - // initialize the paritioned timers to an initial timer -void partitionedTimers::init(timerPair init_timer_pair) { +void partitionedTimers::init(explicitTimer timer) { KMP_DEBUG_ASSERT(this->timer_stack.size() == 0); - timer_stack.push_back(init_timer_pair); - timers[init_timer_pair.get_index()]->start(init_timer_pair.get_timer()); + timer_stack.push_back(timer); + timer_stack.back().start(tsc_tick_count::now()); } // stop/save the current timer, and start the new timer (timer_pair) // There is a special condition where if the current timer is equal to // the one you are trying to push, then it only manipulates the stack, // and it won't stop/start the currently running timer. -void partitionedTimers::push(timerPair timer_pair) { +void partitionedTimers::push(explicitTimer timer) { // get the current timer - // stop current timer + // pause current timer // push new timer // start the new timer + explicitTimer *current_timer, *new_timer; + size_t stack_size; KMP_DEBUG_ASSERT(this->timer_stack.size() > 0); - timerPair current_timer = timer_stack.back(); - timer_stack.push_back(timer_pair); - if (current_timer != timer_pair) { - timers[current_timer.get_index()]->pause(); - timers[timer_pair.get_index()]->start(timer_pair.get_timer()); - } + timer_stack.push_back(timer); + stack_size = timer_stack.size(); + current_timer = &(timer_stack[stack_size - 2]); + new_timer = &(timer_stack[stack_size - 1]); + tsc_tick_count tick = tsc_tick_count::now(); + current_timer->pause(tick); + new_timer->start(tick); } // stop/discard the current timer, and start the previously saved timer void partitionedTimers::pop() { // get the current timer - // stop current timer + // stop current timer (record event/sample) // pop current timer - // get the new current timer and start it back up - KMP_DEBUG_ASSERT(this->timer_stack.size() > 1); - timerPair current_timer = timer_stack.back(); + // get the new current timer and resume + explicitTimer *old_timer, *new_timer; + size_t stack_size = timer_stack.size(); + KMP_DEBUG_ASSERT(stack_size > 1); + old_timer = &(timer_stack[stack_size - 1]); + new_timer = &(timer_stack[stack_size - 2]); + tsc_tick_count tick = tsc_tick_count::now(); + old_timer->stop(tick); + new_timer->resume(tick); timer_stack.pop_back(); - timerPair new_timer = timer_stack.back(); - if (current_timer != new_timer) { - timers[current_timer.get_index()]->stop(current_timer.get_timer()); - timers[new_timer.get_index()]->resume(); - } +} + +void partitionedTimers::exchange(explicitTimer timer) { + // get the current timer + // stop current timer (record event/sample) + // push new timer + // start the new timer + explicitTimer *current_timer, *new_timer; + size_t stack_size; + KMP_DEBUG_ASSERT(this->timer_stack.size() > 0); + tsc_tick_count tick = tsc_tick_count::now(); + stack_size = timer_stack.size(); + current_timer = &(timer_stack[stack_size - 1]); + current_timer->stop(tick); + timer_stack.pop_back(); + timer_stack.push_back(timer); + new_timer = &(timer_stack[stack_size - 1]); + new_timer->start(tick); } // Wind up all the currently running timers. @@ -234,10 +342,10 @@ while (timer_stack.size() > 1) { this->pop(); } + // Pop the timer from the init() call if (timer_stack.size() > 0) { - timerPair last_timer = timer_stack.back(); + timer_stack.back().stop(tsc_tick_count::now()); timer_stack.pop_back(); - timers[last_timer.get_index()]->stop(last_timer.get_timer()); } } @@ -255,8 +363,8 @@ // event2 or zero if event1 == event2. This sorts by start time (lowest to // highest). int compare_two_events(const void *event1, const void *event2) { - kmp_stats_event *ev1 = (kmp_stats_event *)event1; - kmp_stats_event *ev2 = (kmp_stats_event *)event2; + const kmp_stats_event *ev1 = RCAST(const kmp_stats_event *, event1); + const kmp_stats_event *ev2 = RCAST(const kmp_stats_event *, event2); if (ev1->getStart() < ev2->getStart()) return -1; @@ -353,9 +461,93 @@ int kmp_stats_output_module::printPerThreadFlag = 0; int kmp_stats_output_module::printPerThreadEventsFlag = 0; +static char const *lastName(char *name) { + int l = strlen(name); + for (int i = l - 1; i >= 0; --i) { + if (name[i] == '.') + name[i] = '_'; + if (name[i] == '/') + return name + i + 1; + } + return name; +} + +/* Read the name of the executable from /proc/self/cmdline */ +static char const *getImageName(char *buffer, size_t buflen) { + FILE *f = fopen("/proc/self/cmdline", "r"); + buffer[0] = char(0); + if (!f) + return buffer; + + // The file contains char(0) delimited words from the commandline. + // This just returns the last filename component of the first word on the + // line. + size_t n = fread(buffer, 1, buflen, f); + if (n == 0) { + fclose(f); + KMP_CHECK_SYSFAIL("fread", 1) + } + fclose(f); + buffer[buflen - 1] = char(0); + return lastName(buffer); +} + +static void getTime(char *buffer, size_t buflen, bool underscores = false) { + time_t timer; + + time(&timer); + + struct tm *tm_info = localtime(&timer); + if (underscores) + strftime(buffer, buflen, "%Y-%m-%d_%H%M%S", tm_info); + else + strftime(buffer, buflen, "%Y-%m-%d %H%M%S", tm_info); +} + +/* Generate a stats file name, expanding prototypes */ +static std::string generateFilename(char const *prototype, + char const *imageName) { + std::string res; + + for (int i = 0; prototype[i] != char(0); i++) { + char ch = prototype[i]; + + if (ch == '%') { + i++; + if (prototype[i] == char(0)) + break; + + switch (prototype[i]) { + case 't': // Insert time and date + { + char date[26]; + getTime(date, sizeof(date), true); + res += date; + } break; + case 'e': // Insert executable name + res += imageName; + break; + case 'p': // Insert pid + { + std::stringstream ss; + ss << getpid(); + res += ss.str(); + } break; + default: + res += prototype[i]; + break; + } + } else + res += ch; + } + return res; +} + // init() is called very near the beginning of execution time in the constructor // of __kmp_stats_global_output void kmp_stats_output_module::init() { + + fprintf(stderr, "*** Stats enabled OpenMP* runtime ***\n"); char *statsFileName = getenv("KMP_STATS_FILE"); eventsFileName = getenv("KMP_STATS_EVENTS_FILE"); plotFileName = getenv("KMP_STATS_PLOT_FILE"); @@ -364,22 +556,10 @@ // set the stats output filenames based on environment variables and defaults if (statsFileName) { - // append the process id to the output filename - // events.csv --> events-pid.csv - size_t index; - std::string baseFileName, pid, suffix; - std::stringstream ss; - outputFileName = std::string(statsFileName); - index = outputFileName.find_last_of('.'); - if (index == std::string::npos) { - baseFileName = outputFileName; - } else { - baseFileName = outputFileName.substr(0, index); - suffix = outputFileName.substr(index); - } - ss << getpid(); - pid = ss.str(); - outputFileName = baseFileName + "-" + pid + suffix; + char imageName[1024]; + // Process any escapes (e.g., %p, %e, %t) in the name + outputFileName = generateFilename( + statsFileName, getImageName(&imageName[0], sizeof(imageName))); } eventsFileName = eventsFileName ? eventsFileName : "events.dat"; plotFileName = plotFileName ? plotFileName : "events.plt"; @@ -396,8 +576,6 @@ // will clear flag so that no event will be logged timeStat::clearEventFlags(); } - - return; } void kmp_stats_output_module::setupEventColors() { @@ -410,28 +588,43 @@ globalColorIndex = (globalColorIndex + 1) % numGlobalColors; } } - return; } void kmp_stats_output_module::printTimerStats(FILE *statsOut, statistic const *theStats, statistic const *totalStats) { - fprintf(statsOut, "Timer, SampleCount, Min, " - "Mean, Max, Total, SD\n"); + fprintf(statsOut, + "Timer, SampleCount, Min, " + "Mean, Max, Total, SD\n"); for (timer_e s = timer_e(0); s < TIMER_LAST; s = timer_e(s + 1)) { statistic const *stat = &theStats[s]; char tag = timeStat::noUnits(s) ? ' ' : 'T'; - fprintf(statsOut, "%-28s, %s\n", timeStat::name(s), + fprintf(statsOut, "%-35s, %s\n", timeStat::name(s), stat->format(tag, true).c_str()); } // Also print the Total_ versions of times. for (timer_e s = timer_e(0); s < TIMER_LAST; s = timer_e(s + 1)) { char tag = timeStat::noUnits(s) ? ' ' : 'T'; if (totalStats && !timeStat::noTotal(s)) - fprintf(statsOut, "Total_%-22s, %s\n", timeStat::name(s), + fprintf(statsOut, "Total_%-29s, %s\n", timeStat::name(s), totalStats[s].format(tag, true).c_str()); } + + // Print historgram of statistics + if (theStats[0].haveHist()) { + fprintf(statsOut, "\nTimer distributions\n"); + for (int s = 0; s < TIMER_LAST; s++) { + statistic const *stat = &theStats[s]; + + if (stat->getCount() != 0) { + char tag = timeStat::noUnits(timer_e(s)) ? ' ' : 'T'; + + fprintf(statsOut, "%s\n", timeStat::name(timer_e(s))); + fprintf(statsOut, "%s\n", stat->getHist()->format(tag).c_str()); + } + } + } } void kmp_stats_output_module::printCounterStats(FILE *statsOut, @@ -443,6 +636,18 @@ fprintf(statsOut, "%-25s, %s\n", counter::name(counter_e(s)), stat->format(' ', true).c_str()); } + // Print histogram of counters + if (theStats[0].haveHist()) { + fprintf(statsOut, "\nCounter distributions\n"); + for (int s = 0; s < COUNTER_LAST; s++) { + statistic const *stat = &theStats[s]; + + if (stat->getCount() != 0) { + fprintf(statsOut, "%s\n", counter::name(counter_e(s))); + fprintf(statsOut, "%s\n", stat->getHist()->format(' ').c_str()); + } + } + } } void kmp_stats_output_module::printCounters(FILE *statsOut, @@ -480,9 +685,7 @@ for (it = __kmp_stats_list->begin(); it != __kmp_stats_list->end(); it++) { kmp_stats_list *ptr = *it; ptr->getPartitionedTimers()->windup(); - for (int timer = 0; timer < EXPLICIT_TIMER_LAST; timer++) { - ptr->getExplicitTimer(explicit_timer_e(timer))->stop((timer_e)timer, ptr); - } + ptr->endLife(); } } @@ -547,6 +750,11 @@ return; } +static void outputEnvVariable(FILE *statsOut, char const *name) { + char const *value = getenv(name); + fprintf(statsOut, "# %s = %s\n", name, value ? value : "*unspecified*"); +} + /* Print some useful information about * the date and time this experiment ran. * the machine on which it ran. @@ -570,6 +778,11 @@ else fprintf(statsOut, "# Nominal frequency: %sz\n", formatSI(double(__kmp_cpuinfo.frequency), 9, 'H').c_str()); + outputEnvVariable(statsOut, "KMP_HW_SUBSET"); + outputEnvVariable(statsOut, "KMP_AFFINITY"); + outputEnvVariable(statsOut, "KMP_BLOCKTIME"); + outputEnvVariable(statsOut, "KMP_LIBRARY"); + fprintf(statsOut, "# Production runtime built " __DATE__ " " __TIME__ "\n"); #endif } @@ -665,7 +878,6 @@ for (it = __kmp_stats_list->begin(); it != __kmp_stats_list->end(); it++) { timeStat *timers = (*it)->getTimers(); counter *counters = (*it)->getCounters(); - explicitTimer *eTimers = (*it)->getExplicitTimers(); for (int t = 0; t < TIMER_LAST; t++) timers[t].reset(); @@ -673,9 +885,6 @@ for (int c = 0; c < COUNTER_LAST; c++) counters[c].reset(); - for (int t = 0; t < EXPLICIT_TIMER_LAST; t++) - eTimers[t].reset(); - // reset the event vector so all previous events are "erased" (*it)->resetEventVector(); } Index: openmp/trunk/runtime/src/kmp_stats_timing.cpp =================================================================== --- openmp/trunk/runtime/src/kmp_stats_timing.cpp +++ openmp/trunk/runtime/src/kmp_stats_timing.cpp @@ -95,10 +95,11 @@ static struct { double scale; char prefix; - } ranges[] = {{1.e12, 'f'}, {1.e9, 'p'}, {1.e6, 'n'}, {1.e3, 'u'}, - {1.0, 'm'}, {1.e-3, ' '}, {1.e-6, 'k'}, {1.e-9, 'M'}, - {1.e-12, 'G'}, {1.e-15, 'T'}, {1.e-18, 'P'}, {1.e-21, 'E'}, - {1.e-24, 'Z'}, {1.e-27, 'Y'}}; + } ranges[] = {{1.e21, 'y'}, {1.e18, 'z'}, {1.e15, 'a'}, {1.e12, 'f'}, + {1.e9, 'p'}, {1.e6, 'n'}, {1.e3, 'u'}, {1.0, 'm'}, + {1.e-3, ' '}, {1.e-6, 'k'}, {1.e-9, 'M'}, {1.e-12, 'G'}, + {1.e-15, 'T'}, {1.e-18, 'P'}, {1.e-21, 'E'}, {1.e-24, 'Z'}, + {1.e-27, 'Y'}}; if (interval == 0.0) { os << std::setw(width - 3) << std::right << "0.00" << std::setw(3) Index: openmp/trunk/runtime/src/z_Linux_util.cpp =================================================================== --- openmp/trunk/runtime/src/z_Linux_util.cpp +++ openmp/trunk/runtime/src/z_Linux_util.cpp @@ -510,7 +510,7 @@ #if KMP_STATS_ENABLED // set thread local index to point to thread-specific stats __kmp_stats_thread_ptr = ((kmp_info_t *)thr)->th.th_stats; - KMP_START_EXPLICIT_TIMER(OMP_worker_thread_life); + __kmp_stats_thread_ptr->startLife(); KMP_SET_THREAD_STATE(IDLE); KMP_INIT_PARTITIONED_TIMERS(OMP_idle); #endif