diff --git a/openmp/runtime/src/kmp.h b/openmp/runtime/src/kmp.h
--- a/openmp/runtime/src/kmp.h
+++ b/openmp/runtime/src/kmp.h
@@ -1315,6 +1315,82 @@
 #define KMP_X86_MXCSR_MASK 0xffffffc0 /* ignore status flags (6 lsb) */

+// User-level Monitor/Mwait
+#if KMP_HAVE_UMWAIT
+// We always try for UMWAIT first
+#if KMP_HAVE_WAITPKG_INTRINSICS
+#if KMP_HAVE_IMMINTRIN_H
+#include <immintrin.h>
+#elif KMP_HAVE_INTRIN_H
+#include <intrin.h>
+#endif
+#endif // KMP_HAVE_WAITPKG_INTRINSICS
+
+KMP_ATTRIBUTE_TARGET_WAITPKG
+static inline int __kmp_tpause(uint32_t hint, uint64_t counter) {
+#if !KMP_HAVE_WAITPKG_INTRINSICS
+  uint32_t timeHi = uint32_t(counter >> 32);
+  uint32_t timeLo = uint32_t(counter & 0xffffffff);
+  char flag;
+  __asm__ volatile("#tpause\n.byte 0x66, 0x0F, 0xAE, 0xF1\n"
+                   "setb %0"
+                   : "=r"(flag)
+                   : "a"(timeLo), "d"(timeHi), "c"(hint)
+                   :);
+  return flag;
+#else
+  return _tpause(hint, counter);
+#endif
+}
+KMP_ATTRIBUTE_TARGET_WAITPKG
+static inline void __kmp_umonitor(void *cacheline) {
+#if !KMP_HAVE_WAITPKG_INTRINSICS
+  __asm__ volatile("# umonitor\n.byte 0xF3, 0x0F, 0xAE, 0x01 "
+                   :
+                   : "a"(cacheline)
+                   :);
+#else
+  _umonitor(cacheline);
+#endif
+}
+KMP_ATTRIBUTE_TARGET_WAITPKG
+static inline int __kmp_umwait(uint32_t hint, uint64_t counter) {
+#if !KMP_HAVE_WAITPKG_INTRINSICS
+  uint32_t timeHi = uint32_t(counter >> 32);
+  uint32_t timeLo = uint32_t(counter & 0xffffffff);
+  char flag;
+  __asm__ volatile("#umwait\n.byte 0xF2, 0x0F, 0xAE, 0xF1\n"
+                   "setb %0"
+                   : "=r"(flag)
+                   : "a"(timeLo), "d"(timeHi), "c"(hint)
+                   :);
+  return flag;
+#else
+  return _umwait(hint, counter);
+#endif
+}
+#elif KMP_HAVE_MWAIT
+#if KMP_OS_UNIX
+#include <pmmintrin.h>
+#else
+#include <intrin.h>
+#endif
+#if KMP_OS_UNIX
+__attribute__((target("sse3")))
+#endif
+static inline void
+__kmp_mm_monitor(void *cacheline, unsigned extensions, unsigned hints) {
+  _mm_monitor(cacheline, extensions, hints);
+}
+#if KMP_OS_UNIX
+__attribute__((target("sse3")))
+#endif
+static inline void
+__kmp_mm_mwait(unsigned extensions, unsigned hints) {
+  _mm_mwait(extensions, hints);
+}
+#endif // KMP_HAVE_UMWAIT
+
 #if KMP_ARCH_X86
 extern void __kmp_x86_pause(void);
 #elif KMP_MIC
@@ -1344,6 +1420,9 @@
 #define KMP_INIT_YIELD(count) \
   { (count) = __kmp_yield_init; }

+#define KMP_INIT_BACKOFF(time) \
+  { (time) = __kmp_pause_init; }
+
 #define KMP_OVERSUBSCRIBED \
   (TCR_4(__kmp_nth) > (__kmp_avail_proc ? __kmp_avail_proc : __kmp_xproc))

@@ -1381,7 +1460,36 @@
   } \
   }

-#define KMP_YIELD_OVERSUB_ELSE_SPIN(count) \
+// If TPAUSE is available & enabled, use it. If oversubscribed, use the slower
+// (C0.2) state, which improves performance of other SMT threads on the same
+// core; otherwise, use the fast (C0.1) default state, or whatever the user has
+// requested. Uses a timed TPAUSE and exponential backoff. If TPAUSE isn't
+// available, fall back to the regular CPU pause and yield combination.
+#if KMP_HAVE_UMWAIT
+#define KMP_YIELD_OVERSUB_ELSE_SPIN(count, time) \
+  { \
+    if (__kmp_tpause_enabled) { \
+      if (KMP_OVERSUBSCRIBED) { \
+        __kmp_tpause(0, (time)); \
+      } else { \
+        __kmp_tpause(__kmp_tpause_hint, (time)); \
+      } \
+      (time) *= 2; \
+    } else { \
+      KMP_CPU_PAUSE(); \
+      if ((KMP_TRY_YIELD_OVERSUB)) { \
+        __kmp_yield(); \
+      } else if (__kmp_use_yield == 1) { \
+        (count) -= 2; \
+        if (!(count)) { \
+          __kmp_yield(); \
+          (count) = __kmp_yield_next; \
+        } \
+      } \
+    } \
+  }
+#else
+#define KMP_YIELD_OVERSUB_ELSE_SPIN(count, time) \
   { \
     KMP_CPU_PAUSE(); \
     if ((KMP_TRY_YIELD_OVERSUB)) \
@@ -1394,80 +1502,6 @@
   } \
   } \
   }
-
-// User-level Monitor/Mwait
-#if KMP_HAVE_UMWAIT
-// We always try for UMWAIT first
-#if KMP_HAVE_WAITPKG_INTRINSICS
-#if KMP_HAVE_IMMINTRIN_H
-#include <immintrin.h>
-#elif KMP_HAVE_INTRIN_H
-#include <intrin.h>
-#endif
-#endif // KMP_HAVE_WAITPKG_INTRINSICS
-KMP_ATTRIBUTE_TARGET_WAITPKG
-static inline int __kmp_tpause(uint32_t hint, uint64_t counter) {
-#if !KMP_HAVE_WAITPKG_INTRINSICS
-  uint32_t timeHi = uint32_t(counter >> 32);
-  uint32_t timeLo = uint32_t(counter & 0xffffffff);
-  char flag;
-  __asm__ volatile("#tpause\n.byte 0x66, 0x0F, 0xAE, 0xF1\n"
-                   "setb %0"
-                   : "=r"(flag)
-                   : "a"(timeLo), "d"(timeHi), "c"(hint)
-                   :);
-  return flag;
-#else
-  return _tpause(hint, counter);
-#endif
-}
-KMP_ATTRIBUTE_TARGET_WAITPKG
-static inline void __kmp_umonitor(void *cacheline) {
-#if !KMP_HAVE_WAITPKG_INTRINSICS
-  __asm__ volatile("# umonitor\n.byte 0xF3, 0x0F, 0xAE, 0x01 "
-                   :
-                   : "a"(cacheline)
-                   :);
-#else
-  _umonitor(cacheline);
-#endif
-}
-KMP_ATTRIBUTE_TARGET_WAITPKG
-static inline int __kmp_umwait(uint32_t hint, uint64_t counter) {
-#if !KMP_HAVE_WAITPKG_INTRINSICS
-  uint32_t timeHi = uint32_t(counter >> 32);
-  uint32_t timeLo = uint32_t(counter & 0xffffffff);
-  char flag;
-  __asm__ volatile("#umwait\n.byte 0xF2, 0x0F, 0xAE, 0xF1\n"
-                   "setb %0"
-                   : "=r"(flag)
-                   : "a"(timeLo), "d"(timeHi), "c"(hint)
-                   :);
-  return flag;
-#else
-  return _umwait(hint, counter);
-#endif
-}
-#elif KMP_HAVE_MWAIT
-#if KMP_OS_UNIX
-#include <pmmintrin.h>
-#else
-#include <intrin.h>
-#endif
-#if KMP_OS_UNIX
-__attribute__((target("sse3")))
-#endif
-static inline void
-__kmp_mm_monitor(void *cacheline, unsigned extensions, unsigned hints) {
-  _mm_monitor(cacheline, extensions, hints);
-}
-#if KMP_OS_UNIX
-__attribute__((target("sse3")))
-#endif
-static inline void
-__kmp_mm_mwait(unsigned extensions, unsigned hints) {
-  _mm_mwait(extensions, hints);
-}
 #endif // KMP_HAVE_UMWAIT

 /* ------------------------------------------------------------------------ */

@@ -3088,6 +3122,7 @@
 extern kmp_int32 __kmp_use_yield_exp_set;
 extern kmp_uint32 __kmp_yield_init;
 extern kmp_uint32 __kmp_yield_next;
+extern kmp_uint64 __kmp_pause_init;

 /* ------------------------------------------------------------------------- */
 extern int __kmp_allThreadsSpecified;
@@ -3290,6 +3325,13 @@
 extern int __kmp_mwait_hints; // Hints to pass in to mwait
 #endif

+#if KMP_HAVE_UMWAIT
+extern int __kmp_waitpkg_enabled; // Runtime check if waitpkg exists
+extern int __kmp_tpause_state; // 0 (default), 1=C0.1, 2=C0.2; from KMP_TPAUSE
+extern int __kmp_tpause_hint; // 1=C0.1 (default), 0=C0.2; from KMP_TPAUSE
+extern int __kmp_tpause_enabled; // 0 (default), 1 (KMP_TPAUSE is non-zero)
+#endif
+
 /* ------------------------------------------------------------------------- */

 extern kmp_global_t __kmp_global; /* global status */

diff --git a/openmp/runtime/src/kmp_dispatch.h b/openmp/runtime/src/kmp_dispatch.h
--- a/openmp/runtime/src/kmp_dispatch.h
+++ b/openmp/runtime/src/kmp_dispatch.h
@@ -292,10 +292,12 @@
   UT check = checker;
   kmp_uint32 spins;
   kmp_uint32 (*f)(UT, UT) = pred;
+  kmp_uint64 time;
   UT r;

   KMP_FSYNC_SPIN_INIT(obj, CCAST(UT *, spin));
   KMP_INIT_YIELD(spins);
+  KMP_INIT_BACKOFF(time);
   // main wait spin loop
   while (!f(r = *spin, check)) {
     KMP_FSYNC_SPIN_PREPARE(obj);
@@ -305,7 +307,7 @@
     /* if ( TCR_4(__kmp_global.g.g_done) && __kmp_global.g.g_abort)
         __kmp_abort_thread(); */
     // If oversubscribed, or have waited a bit then yield.
-    KMP_YIELD_OVERSUB_ELSE_SPIN(spins);
+    KMP_YIELD_OVERSUB_ELSE_SPIN(spins, time);
   }
   KMP_FSYNC_SPIN_ACQUIRED(obj);
   return r;

diff --git a/openmp/runtime/src/kmp_dispatch.cpp b/openmp/runtime/src/kmp_dispatch.cpp
--- a/openmp/runtime/src/kmp_dispatch.cpp
+++ b/openmp/runtime/src/kmp_dispatch.cpp
@@ -2655,9 +2655,11 @@
   kmp_uint32 spins;
   kmp_uint32 (*f)(kmp_uint32, kmp_uint32) = pred;
   kmp_uint32 r;
+  kmp_uint64 time;

   KMP_FSYNC_SPIN_INIT(obj, CCAST(kmp_uint32 *, spin));
   KMP_INIT_YIELD(spins);
+  KMP_INIT_BACKOFF(time);
   // main wait spin loop
   while (!f(r = TCR_4(*spin), check)) {
     KMP_FSYNC_SPIN_PREPARE(obj);
@@ -2665,7 +2667,7 @@
      split. It causes problems with infinite recursion because of exit lock */
     /* if ( TCR_4(__kmp_global.g.g_done) && __kmp_global.g.g_abort)
         __kmp_abort_thread(); */
-    KMP_YIELD_OVERSUB_ELSE_SPIN(spins);
+    KMP_YIELD_OVERSUB_ELSE_SPIN(spins, time);
   }
   KMP_FSYNC_SPIN_ACQUIRED(obj);
   return r;
@@ -2680,15 +2682,17 @@
   kmp_uint32 check = checker;
   kmp_uint32 spins;
   kmp_uint32 (*f)(void *, kmp_uint32) = pred;
+  kmp_uint64 time;

   KMP_FSYNC_SPIN_INIT(obj, spin);
   KMP_INIT_YIELD(spins);
+  KMP_INIT_BACKOFF(time);
   // main wait spin loop
   while (!f(spin, check)) {
     KMP_FSYNC_SPIN_PREPARE(obj);
     /* if we have waited a bit, or are noversubscribed, yield */
     /* pause is in the following code */
-    KMP_YIELD_OVERSUB_ELSE_SPIN(spins);
+    KMP_YIELD_OVERSUB_ELSE_SPIN(spins, time);
   }
   KMP_FSYNC_SPIN_ACQUIRED(obj);
 }

diff --git a/openmp/runtime/src/kmp_global.cpp b/openmp/runtime/src/kmp_global.cpp
--- a/openmp/runtime/src/kmp_global.cpp
+++ b/openmp/runtime/src/kmp_global.cpp
@@ -219,6 +219,13 @@
 int __kmp_mwait_hints = 0;
 #endif

+#if KMP_HAVE_UMWAIT
+int __kmp_waitpkg_enabled = 0;
+int __kmp_tpause_state = 0;
+int __kmp_tpause_hint = 1;
+int __kmp_tpause_enabled = 0;
+#endif
+
 /* map OMP 3.0 schedule types with our internal schedule types */
 enum sched_type __kmp_sch_map[kmp_sched_upper - kmp_sched_lower_ext +
                               kmp_sched_upper_std - kmp_sched_lower - 2] = {
@@ -425,6 +432,7 @@
 kmp_uint32 __kmp_yield_init = KMP_INIT_WAIT;
 kmp_uint32 __kmp_yield_next = KMP_NEXT_WAIT;
+kmp_uint64 __kmp_pause_init = 1; // for tpause

 /* ------------------------------------------------------ */
 /* STATE mostly syncronized with global lock */

diff --git a/openmp/runtime/src/kmp_lock.h b/openmp/runtime/src/kmp_lock.h
--- a/openmp/runtime/src/kmp_lock.h
+++ b/openmp/runtime/src/kmp_lock.h
@@ -651,12 +651,15 @@
     if (lck->tas.lk.poll != 0 || \
         !__kmp_atomic_compare_store_acq(&lck->tas.lk.poll, 0, gtid + 1)) { \
       kmp_uint32 spins; \
+      kmp_uint64 time; \
       KMP_FSYNC_PREPARE(lck); \
       KMP_INIT_YIELD(spins); \
+      KMP_INIT_BACKOFF(time); \
       do { \
-        KMP_YIELD_OVERSUB_ELSE_SPIN(spins); \
-      } while (lck->tas.lk.poll != 0 || !__kmp_atomic_compare_store_acq( \
-                   &lck->tas.lk.poll, 0, gtid + 1)); \
+        KMP_YIELD_OVERSUB_ELSE_SPIN(spins, time); \
+      } while ( \
+          lck->tas.lk.poll != 0 || \
+          !__kmp_atomic_compare_store_acq(&lck->tas.lk.poll, 0, gtid + 1)); \
     } \
     KMP_FSYNC_ACQUIRED(lck); \
   } else { \
@@ -758,10 +761,12 @@
     if ((lck->tas.lk.poll != 0) || \
         !__kmp_atomic_compare_store_acq(&lck->tas.lk.poll, 0, gtid + 1)) { \
       kmp_uint32 spins; \
+      kmp_uint64 time; \
       KMP_FSYNC_PREPARE(lck); \
       KMP_INIT_YIELD(spins); \
+      KMP_INIT_BACKOFF(time); \
       do { \
-        KMP_YIELD_OVERSUB_ELSE_SPIN(spins); \
+        KMP_YIELD_OVERSUB_ELSE_SPIN(spins, time); \
       } while ( \
           (lck->tas.lk.poll != 0) || \
           !__kmp_atomic_compare_store_acq(&lck->tas.lk.poll, 0, gtid + 1)); \

diff --git a/openmp/runtime/src/kmp_lock.cpp b/openmp/runtime/src/kmp_lock.cpp
--- a/openmp/runtime/src/kmp_lock.cpp
+++ b/openmp/runtime/src/kmp_lock.cpp
@@ -96,12 +96,19 @@
   }

   kmp_uint32 spins;
+  kmp_uint64 time;
   KMP_FSYNC_PREPARE(lck);
   KMP_INIT_YIELD(spins);
+  KMP_INIT_BACKOFF(time);
   kmp_backoff_t backoff = __kmp_spin_backoff_params;
   do {
+#if !KMP_HAVE_UMWAIT
     __kmp_spin_backoff(&backoff);
-    KMP_YIELD_OVERSUB_ELSE_SPIN(spins);
+#else
+    if (!__kmp_tpause_enabled)
+      __kmp_spin_backoff(&backoff);
+#endif
+    KMP_YIELD_OVERSUB_ELSE_SPIN(spins, time);
   } while (KMP_ATOMIC_LD_RLX(&lck->lk.poll) != tas_free ||
            !__kmp_atomic_compare_store_acq(&lck->lk.poll, tas_free, tas_busy));
   KMP_FSYNC_ACQUIRED(lck);
@@ -2227,10 +2234,12 @@
   // The current implementation of KMP_WAIT doesn't allow for mask
   // and poll to be re-read every spin iteration.
   kmp_uint32 spins;
+  kmp_uint64 time;
   KMP_FSYNC_PREPARE(lck);
   KMP_INIT_YIELD(spins);
+  KMP_INIT_BACKOFF(time);
   while (polls[ticket & mask] < ticket) { // atomic load
-    KMP_YIELD_OVERSUB_ELSE_SPIN(spins);
+    KMP_YIELD_OVERSUB_ELSE_SPIN(spins, time);
     // Re-read the mask and the poll pointer from the lock structure.
     //
     // Make certain that "mask" is read before "polls" !!!
@@ -2659,9 +2668,17 @@
   kmp_uint32 i;
   for (i = boff->step; i > 0; i--) {
     kmp_uint64 goal = __kmp_tsc() + boff->min_tick;
-    do {
-      KMP_CPU_PAUSE();
-    } while (before(__kmp_tsc(), goal));
+#if KMP_HAVE_UMWAIT
+    if (__kmp_umwait_enabled) {
+      __kmp_tpause(0, boff->min_tick);
+    } else {
+#endif
+      do {
+        KMP_CPU_PAUSE();
+      } while (before(__kmp_tsc(), goal));
+#if KMP_HAVE_UMWAIT
+    }
+#endif
   }
   boff->step = (boff->step << 1 | 1) & (boff->max_backoff - 1);
 }

diff --git a/openmp/runtime/src/kmp_runtime.cpp b/openmp/runtime/src/kmp_runtime.cpp
--- a/openmp/runtime/src/kmp_runtime.cpp
+++ b/openmp/runtime/src/kmp_runtime.cpp
@@ -6895,7 +6895,9 @@
 static void __kmp_user_level_mwait_init() {
   struct kmp_cpuid buf;
   __kmp_x86_cpuid(7, 0, &buf);
-  __kmp_umwait_enabled = ((buf.ecx >> 5) & 1) && __kmp_user_level_mwait;
+  __kmp_waitpkg_enabled = ((buf.ecx >> 5) & 1);
+  __kmp_umwait_enabled = __kmp_waitpkg_enabled && __kmp_user_level_mwait;
+  __kmp_tpause_enabled = __kmp_waitpkg_enabled && (__kmp_tpause_state > 0);
   KF_TRACE(30, ("__kmp_user_level_mwait_init: __kmp_umwait_enabled = %d\n",
                 __kmp_umwait_enabled));
 }

diff --git a/openmp/runtime/src/kmp_settings.cpp b/openmp/runtime/src/kmp_settings.cpp
--- a/openmp/runtime/src/kmp_settings.cpp
+++ b/openmp/runtime/src/kmp_settings.cpp
@@ -5171,6 +5171,27 @@
 #endif // KMP_HAVE_MWAIT || KMP_HAVE_UMWAIT

+#if KMP_HAVE_UMWAIT
+// -----------------------------------------------------------------------------
+// KMP_TPAUSE
+// 0 = don't use TPAUSE, 1 = use C0.1 state, 2 = use C0.2 state
+
+static void __kmp_stg_parse_tpause(char const *name, char const *value,
+                                   void *data) {
+  __kmp_stg_parse_int(name, value, 0, INT_MAX, &__kmp_tpause_state);
+  if (__kmp_tpause_state != 0) {
+    // The actual hint passed to tpause is: 0 for C0.2 and 1 for C0.1
+    if (__kmp_tpause_state == 2) // use C0.2
+      __kmp_tpause_hint = 0; // default was set to 1 for C0.1
+  }
+} // __kmp_stg_parse_tpause
+
+static void __kmp_stg_print_tpause(kmp_str_buf_t *buffer, char const *name,
+                                   void *data) {
+  __kmp_stg_print_int(buffer, name, __kmp_tpause_state);
+} // __kmp_stg_print_tpause
+#endif // KMP_HAVE_UMWAIT
+
 // -----------------------------------------------------------------------------
 // OMP_DISPLAY_ENV

@@ -5536,6 +5557,10 @@
     {"KMP_MWAIT_HINTS", __kmp_stg_parse_mwait_hints,
      __kmp_stg_print_mwait_hints, NULL, 0, 0},
 #endif
+
+#if KMP_HAVE_UMWAIT
+    {"KMP_TPAUSE", __kmp_stg_parse_tpause, __kmp_stg_print_tpause, NULL, 0, 0},
+#endif
     {"", NULL, NULL, NULL, 0, 0}}; // settings

 static int const __kmp_stg_count =

diff --git a/openmp/runtime/src/kmp_tasking.cpp b/openmp/runtime/src/kmp_tasking.cpp
--- a/openmp/runtime/src/kmp_tasking.cpp
+++ b/openmp/runtime/src/kmp_tasking.cpp
@@ -3552,9 +3552,11 @@
 void __kmp_wait_to_unref_task_teams(void) {
   kmp_info_t *thread;
   kmp_uint32 spins;
+  kmp_uint64 time;
   int done;

   KMP_INIT_YIELD(spins);
+  KMP_INIT_BACKOFF(time);

   for (;;) {
     done = TRUE;
@@ -3604,7 +3606,7 @@
     }

     // If oversubscribed or have waited a bit, yield.
-    KMP_YIELD_OVERSUB_ELSE_SPIN(spins);
+    KMP_YIELD_OVERSUB_ELSE_SPIN(spins, time);
   }
 }

diff --git a/openmp/runtime/src/kmp_wait_release.h b/openmp/runtime/src/kmp_wait_release.h
--- a/openmp/runtime/src/kmp_wait_release.h
+++ b/openmp/runtime/src/kmp_wait_release.h
@@ -377,6 +377,7 @@
 #else
   kmp_uint32 hibernate;
 #endif
+  kmp_uint64 time;

   KMP_FSYNC_SPIN_INIT(spin, NULL);
   if (flag->done_check()) {
@@ -476,6 +477,7 @@
 #endif
   KMP_INIT_YIELD(spins); // Setup for waiting
+  KMP_INIT_BACKOFF(time);

   if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME ||
       __kmp_pause_status == kmp_soft_paused) {
@@ -563,7 +565,7 @@

     // If we are oversubscribed, or have waited a bit (and
     // KMP_LIBRARY=throughput), then yield
-    KMP_YIELD_OVERSUB_ELSE_SPIN(spins);
+    KMP_YIELD_OVERSUB_ELSE_SPIN(spins, time);

 #if KMP_STATS_ENABLED
     // Check if thread has been signalled to idle state

diff --git a/openmp/runtime/src/z_Windows_NT_util.cpp b/openmp/runtime/src/z_Windows_NT_util.cpp
--- a/openmp/runtime/src/z_Windows_NT_util.cpp
+++ b/openmp/runtime/src/z_Windows_NT_util.cpp
@@ -1327,16 +1327,18 @@
     // KMP_WAIT to cover this usage also.
     void *obj = NULL;
     kmp_uint32 spins;
+    kmp_uint64 time;
 #if USE_ITT_BUILD
     KMP_FSYNC_SPIN_INIT(obj, (void *)&th->th.th_info.ds.ds_alive);
 #endif /* USE_ITT_BUILD */
     KMP_INIT_YIELD(spins);
+    KMP_INIT_BACKOFF(time);
     do {
 #if USE_ITT_BUILD
       KMP_FSYNC_SPIN_PREPARE(obj);
 #endif /* USE_ITT_BUILD */
       __kmp_is_thread_alive(th, &exit_val);
-      KMP_YIELD_OVERSUB_ELSE_SPIN(spins);
+      KMP_YIELD_OVERSUB_ELSE_SPIN(spins, time);
     } while (exit_val == STILL_ACTIVE && TCR_4(th->th.th_info.ds.ds_alive));
 #if USE_ITT_BUILD
     if (exit_val == STILL_ACTIVE) {
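
Note for reviewers: the sketch below is illustrative only and is not part of the patch. It shows, as a freestanding C++ function, the waiting pattern that the TPAUSE arm of KMP_YIELD_OVERSUB_ELSE_SPIN implements: hint 0 (C0.2) when oversubscribed, hint 1 (C0.1) otherwise, with the backoff counter doubling after every wait. The function and variable names are hypothetical, and unlike the macro (which passes its raw backoff counter directly as the TPAUSE operand), the sketch computes an explicit deadline from the current TSC, since TPAUSE's 64-bit operand is an absolute TSC target.

// Illustrative sketch, not part of the patch. Assumes a waitpkg-capable
// toolchain (e.g. clang++ -mwaitpkg) and a CPU reporting
// CPUID.(EAX=7,ECX=0):ECX[5] (WAITPKG). Names here are hypothetical.
#include <immintrin.h> // _tpause, __rdtsc
#include <atomic>
#include <cstdint>

void spin_wait_tpause(const std::atomic<int> &ready, bool oversubscribed) {
  uint64_t window = 1; // like KMP_INIT_BACKOFF: starts at __kmp_pause_init (1)
  // TPAUSE hint bit 0 selects the power state: 0 = deeper C0.2 (cheaper for
  // the SMT sibling), 1 = lighter C0.1 (faster wakeup).
  const unsigned hint = oversubscribed ? 0u : 1u;
  while (ready.load(std::memory_order_acquire) == 0) {
    // TPAUSE waits until the TSC reaches the given deadline; the OS may clamp
    // the wait via IA32_UMWAIT_CONTROL, in which case _tpause returns nonzero.
    _tpause(hint, __rdtsc() + window);
    window *= 2; // exponential backoff, the macro's `(time) *= 2`
  }
}

Choosing the deeper C0.2 state only under oversubscription trades wakeup latency for throughput of the sibling SMT thread, which matches the intent stated in the macro's comment; in the non-oversubscribed case the lighter C0.1 state (or whatever KMP_TPAUSE selects) keeps wakeups fast.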